Import python-pcre2_0.4.0+ds.orig.tar.xz
author Michael R. Crusoe <crusoe@debian.org>
Tue, 19 Nov 2024 11:44:45 +0000 (12:44 +0100)
committer Michael R. Crusoe <crusoe@debian.org>
Tue, 19 Nov 2024 11:44:45 +0000 (12:44 +0100)
[dgit import orig python-pcre2_0.4.0+ds.orig.tar.xz]

34 files changed:
CMakeLists.txt [new file with mode: 0644]
LICENSE [new file with mode: 0644]
Makefile [new file with mode: 0644]
PKG-INFO [new file with mode: 0644]
README.md [new file with mode: 0755]
pyproject.toml [new file with mode: 0755]
requirements/build-requirements.txt [new file with mode: 0644]
requirements/test-requirements.txt [new file with mode: 0644]
setup.cfg [new file with mode: 0644]
setup.py [new file with mode: 0755]
src/pcre2.egg-info/PKG-INFO [new file with mode: 0644]
src/pcre2.egg-info/SOURCES.txt [new file with mode: 0644]
src/pcre2.egg-info/dependency_links.txt [new file with mode: 0644]
src/pcre2.egg-info/top_level.txt [new file with mode: 0644]
src/pcre2/CMakeLists.txt [new file with mode: 0644]
src/pcre2/__init__.py [new file with mode: 0755]
src/pcre2/consts.pxd [new file with mode: 0644]
src/pcre2/consts.pyx [new file with mode: 0644]
src/pcre2/exceptions.pxd [new file with mode: 0755]
src/pcre2/exceptions.pyx [new file with mode: 0755]
src/pcre2/libpcre2.pxd [new file with mode: 0755]
src/pcre2/match.pxd [new file with mode: 0644]
src/pcre2/match.pyx [new file with mode: 0644]
src/pcre2/methods.pxd [new file with mode: 0644]
src/pcre2/methods.pyx [new file with mode: 0644]
src/pcre2/pattern.pxd [new file with mode: 0644]
src/pcre2/pattern.pyx [new file with mode: 0644]
src/pcre2/scanner.pxd [new file with mode: 0644]
src/pcre2/scanner.pyx [new file with mode: 0644]
src/pcre2/utils.pxd [new file with mode: 0755]
src/pcre2/utils.pyx [new file with mode: 0755]
tests/test_groups.py [new file with mode: 0644]
tests/test_match.py [new file with mode: 0644]
tests/test_pattern.py [new file with mode: 0644]

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644 (file)
index 0000000..1b59b31
--- /dev/null
@@ -0,0 +1,38 @@
+# Top-level build for the pcre2 Python bindings: configures the PCRE2 sources
+# under src/libpcre2, then the Cython code under src/pcre2.
+cmake_minimum_required(VERSION 3.7.2)
+
+project(pcre2)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+set(CMAKE_C_STANDARD 99)
+
+# Directory holding pcre2.h; used by the install() rule at the end of the file.
+set(PCRE2_INCLUDE_DIR ${CMAKE_BINARY_DIR}/src/libpcre2)
+# Extra compile flags; PCRE2_CODE_UNIT_WIDTH=8 selects the 8-bit PCRE2 API.
+# NOTE(review): presumably consumed by src/pcre2/CMakeLists.txt - confirm.
+set(CYTHON_EXTRA_COMPILE_ARGS -DPCRE2_CODE_UNIT_WIDTH=8 -fPIC)
+
+# Set PCRE2 options. FORCE pins the cache entries so they are already in
+# effect when src/libpcre2 is added below.
+set(PCRE2_SUPPORT_JIT ON CACHE BOOL "" FORCE)
+set(PCRE2_NEVER_BACKSLASH_C ON CACHE BOOL "" FORCE)
+
+# Default to a release build, but keep a build type the caller selected
+# explicitly (e.g. -DCMAKE_BUILD_TYPE=Debug) instead of overriding it.
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# Build PCRE2 library as both shared and static.
+set(BUILD_STATIC_LIBS ON)
+set(BUILD_SHARED_LIBS ON)
+add_subdirectory(src/libpcre2)
+
+# Build Cython code as shared.
+set(BUILD_STATIC_LIBS OFF)
+set(BUILD_SHARED_LIBS ON)
+add_subdirectory(src/pcre2)
+
+# Include PCRE2 header for Cython API.
+install(FILES ${PCRE2_INCLUDE_DIR}/pcre2.h DESTINATION src/pcre2)
diff --git a/LICENSE b/LICENSE
new file mode 100644 (file)
index 0000000..4a57011
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2022, grtetrault
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile b/Makefile
new file mode 100644 (file)
index 0000000..eb2eb2a
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,37 @@
+SHELL = /bin/bash
+
+# None of these targets produce a file of the same name. Declaring them phony
+# keeps e.g. an existing ./build directory from making `make build` a no-op.
+.PHONY: init build clean purge benchmark
+
+# Fetch git submodules, create ./.venv, install requirements and the package.
+init:
+       git submodule update --init
+       python3 -m venv ./.venv
+       ./.venv/bin/pip install -r ./requirements/build-requirements.txt
+       ./.venv/bin/pip install -r ./requirements/test-requirements.txt
+       ./.venv/bin/pip install .
+
+# Force-reinstall the package into the existing virtual environment.
+build:
+       ./.venv/bin/pip install . --force-reinstall
+
+# Remove build outputs and generated files. Every find is NUL-delimited and
+# uses rm -f so an empty result or a path with spaces is not an error.
+clean:
+       rm -rf ./dist
+       rm -rf ./build
+       rm -rf ./_skbuild
+       find ./src/pcre2 -type f -name '*.c' -print0 | xargs -0 rm -vf
+       find ./src/pcre2 -type f -name '*.html' -print0 | xargs -0 rm -vf
+       find . -type f -name '*.pyc' -print0 | xargs -0 rm -rf
+       find . -type d -name '*.egg-info' -print0 | xargs -0 rm -rf
+       find . -type d -name '*.ipynb_checkpoints' -print0 | xargs -0 rm -rf
+
+# Delete the virtual environment.
+purge:
+       rm -rf ./.venv
+
+# Run the regex-redux benchmark script with the virtual environment's Python.
+benchmark:
+       ./.venv/bin/python ./benchmarks/run_regex_redux.py
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644 (file)
index 0000000..0a6cd5b
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,134 @@
+Metadata-Version: 2.1
+Name: pcre2
+Version: 0.4.0
+Summary: Python bindings for the PCRE2 regular expression library
+Home-page: https://github.com/grtetrault/pcre2.py
+Author: Garrett Tetrault
+License: BSD 3-Clause License
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Programming Language :: C
+Classifier: Programming Language :: Cython
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Operating System :: Microsoft :: Windows
+Description-Content-Type: text/markdown
+License-File: LICENSE
+
+# PCRE2.py: Python bindings for the PCRE2 regular expression library
+
+This project contains Python bindings for [PCRE2](https://github.com/PCRE2Project/pcre2).
+PCRE2 is the revised API for the Perl-compatible regular expressions (PCRE) library created by Philip Hazel.
+For original source code, see the [official PCRE2 repository](https://github.com/PCRE2Project/pcre2).
+
+## Installation
+
+From PyPI:
+```
+pip install pcre2
+```
+
+If a wheel is not available for your platform, the module will be built from source.
+Building requires:
+
+* `cmake`
+* C compiler toolchain, such as `gcc` and `make`
+* `libtool`
+* Python headers
+
+## Usage
+
+Regular expressions are compiled with `pcre2.compile()` which accepts both unicode strings and bytes-like objects.
+This returns a `Pattern` object.
+Expressions can be compiled with a number of options (combined with the bitwise-or operator) and can be JIT compiled,
+
+```python
+>>> import pcre2
+>>> expr = r'(?<head>\w+)\s+(?<tail>\w+)'
+>>> patn = pcre2.compile(expr, options=pcre2.I, jit=True)
+>>> # Patterns can also be JIT compiled after initialization.
+>>> patn.jit_compile()
+```
+
+Inspection of `Pattern` objects is done as follows,
+
+```python
+>>> patn.jit_size
+980
+>>> patn.name_dict()
+{1: 'head', 2: 'tail'}
+>>> patn.options
+524296
+>>> # Deeper inspection into options is available.
+>>> pcre2.CompileOption.decompose(patn.options)
+[<CompileOption.CASELESS: 0x8>, <CompileOption.UTF: 0x80000>]
+```
+
+Once compiled, `Pattern` objects can be used to match against strings.
+Matching returns a `Match` object, which has several functions to view results,
+
+```python
+>>> subj = 'foo bar buzz bazz'
+>>> match = patn.match(subj)
+>>> match.substring()
+'foo bar'
+>>> match.start(), match.end()
+(0, 7)
+```
+
+Substitution is also supported, both from `Pattern` and `Match` objects,
+
+```python
+>>> repl = '$2 $1'
+>>> patn.substitute(repl, subj) # Global substitutions by default.
+'bar foo bazz buzz'
+>>> patn.substitute(repl, subj, suball=False)
+'bar foo buzz bazz'
+>>> match.expand(repl)
+'bar foo buzz bazz'
+```
+
+Additionally, `Pattern` objects support scanning over subjects for all non-overlapping matches,
+
+```python
+>>> for match in patn.scan(subj):
+...     print(match.substring('head'))
+...
+foo
+buzz
+```
+
+## Performance
+
+PCRE2 provides a fast regular expression library, particularly with JIT compilation enabled.
+Below are the `regex-redux` benchmark results included in this repository,
+
+| Script              | Number of runs | Total time | Real time  | User time   | System time   |
+| ------------------- | -------------- | ---------- | ---------- | ----------- | ------------- |
+| `baseline.py`       |             10 |      3.020 |      0.302 |       0.020 |         0.086 |
+| `vanilla.py`        |             10 |     51.380 |      5.138 |      11.408 |         0.529 |
+| `hand_optimized.py` |             10 |     13.190 |      1.319 |       2.846 |         0.344 |
+| `pcre2_module.py`   |             10 |     13.670 |      1.367 |       2.269 |         0.532 |
+Script descriptions are as follows,
+
+| Script              | Description                                                          |
+| ------------------- | -------------------------------------------------------------------- |
+| `baseline.py`       | Reads input file and outputs stored expected output                  |
+| `vanilla.py`        | Pure Python version                                                  |
+| `hand_optimized.py` | Manually written Python `ctypes` bindings for shared PCRE2 C library |
+| `pcre2_module.py`   | Implementation using Python bindings written here                    |
+
+Tests were performed on an M2 Macbook Air.
+Note that to run benchmarks locally, [Git LFS](https://git-lfs.com/) must be installed to download the input dataset.
+Additionally, a Python virtual environment must be created, and the package built
+with `make init` and `make build` respectively.
+For more information on this benchmark, see [The Computer Language Benchmarks Game](https://benchmarksgame-team.pages.debian.net/benchmarksgame/performance/regexredux.html).
+See source code of benchmark scripts for details and original sources.
diff --git a/README.md b/README.md
new file mode 100755 (executable)
index 0000000..e207b84
--- /dev/null
+++ b/README.md
@@ -0,0 +1,110 @@
+# PCRE2.py: Python bindings for the PCRE2 regular expression library
+
+This project contains Python bindings for [PCRE2](https://github.com/PCRE2Project/pcre2).
+PCRE2 is the revised API for the Perl-compatible regular expressions (PCRE) library created by Philip Hazel.
+For original source code, see the [official PCRE2 repository](https://github.com/PCRE2Project/pcre2).
+
+## Installation
+
+From PyPI:
+```
+pip install pcre2
+```
+
+If a wheel is not available for your platform, the module will be built from source.
+Building requires:
+
+* `cmake`
+* C compiler toolchain, such as `gcc` and `make`
+* `libtool`
+* Python headers
+
+## Usage
+
+Regular expressions are compiled with `pcre2.compile()` which accepts both unicode strings and bytes-like objects.
+This returns a `Pattern` object.
+Expressions can be compiled with a number of options (combined with the bitwise-or operator) and can be JIT compiled,
+
+```python
+>>> import pcre2
+>>> expr = r'(?<head>\w+)\s+(?<tail>\w+)'
+>>> patn = pcre2.compile(expr, options=pcre2.I, jit=True)
+>>> # Patterns can also be JIT compiled after initialization.
+>>> patn.jit_compile()
+```
+
+Inspection of `Pattern` objects is done as follows,
+
+```python
+>>> patn.jit_size
+980
+>>> patn.name_dict()
+{1: 'head', 2: 'tail'}
+>>> patn.options
+524296
+>>> # Deeper inspection into options is available.
+>>> pcre2.CompileOption.decompose(patn.options)
+[<CompileOption.CASELESS: 0x8>, <CompileOption.UTF: 0x80000>]
+```
+
+Once compiled, `Pattern` objects can be used to match against strings.
+Matching returns a `Match` object, which has several functions to view results,
+
+```python
+>>> subj = 'foo bar buzz bazz'
+>>> match = patn.match(subj)
+>>> match.substring()
+'foo bar'
+>>> match.start(), match.end()
+(0, 7)
+```
+
+Substitution is also supported, both from `Pattern` and `Match` objects,
+
+```python
+>>> repl = '$2 $1'
+>>> patn.substitute(repl, subj) # Global substitutions by default.
+'bar foo bazz buzz'
+>>> patn.substitute(repl, subj, suball=False)
+'bar foo buzz bazz'
+>>> match.expand(repl)
+'bar foo buzz bazz'
+```
+
+Additionally, `Pattern` objects support scanning over subjects for all non-overlapping matches,
+
+```python
+>>> for match in patn.scan(subj):
+...     print(match.substring('head'))
+...
+foo
+buzz
+```
+
+## Performance
+
+PCRE2 provides a fast regular expression library, particularly with JIT compilation enabled.
+Below are the `regex-redux` benchmark results included in this repository,
+
+| Script              | Number of runs | Total time | Real time  | User time   | System time   |
+| ------------------- | -------------- | ---------- | ---------- | ----------- | ------------- |
+| `baseline.py`       |             10 |      3.020 |      0.302 |       0.020 |         0.086 |
+| `vanilla.py`        |             10 |     51.380 |      5.138 |      11.408 |         0.529 |
+| `hand_optimized.py` |             10 |     13.190 |      1.319 |       2.846 |         0.344 |
+| `pcre2_module.py`   |             10 |     13.670 |      1.367 |       2.269 |         0.532 |
+Script descriptions are as follows,
+
+| Script              | Description                                                          |
+| ------------------- | -------------------------------------------------------------------- |
+| `baseline.py`       | Reads input file and outputs stored expected output                  |
+| `vanilla.py`        | Pure Python version                                                  |
+| `hand_optimized.py` | Manually written Python `ctypes` bindings for shared PCRE2 C library |
+| `pcre2_module.py`   | Implementation using Python bindings written here                    |
+
+Tests were performed on an M2 Macbook Air.
+Note that to run benchmarks locally, [Git LFS](https://git-lfs.com/) must be installed to download the input dataset.
+Additionally, a Python virtual environment must be created, and the package built
+with `make init` and `make build` respectively.
+For more information on this benchmark, see [The Computer Language Benchmarks Game](https://benchmarksgame-team.pages.debian.net/benchmarksgame/performance/regexredux.html).
+See source code of benchmark scripts for details and original sources.
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100755 (executable)
index 0000000..c0f420a
--- /dev/null
@@ -0,0 +1,8 @@
+[build-system]
+requires = [
+  "setuptools>=42",
+  "scikit-build",
+  "Cython",
+  "cmake"
+]
+build-backend = "setuptools.build_meta"
diff --git a/requirements/build-requirements.txt b/requirements/build-requirements.txt
new file mode 100644 (file)
index 0000000..067a22d
--- /dev/null
@@ -0,0 +1,6 @@
+requests
+build
+wheel
+scikit-build
+cmake
+Cython
\ No newline at end of file
diff --git a/requirements/test-requirements.txt b/requirements/test-requirements.txt
new file mode 100644 (file)
index 0000000..209b771
--- /dev/null
@@ -0,0 +1,3 @@
+twine
+pytest
+gitpython
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
new file mode 100644 (file)
index 0000000..8bfd5a1
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,4 @@
+[egg_info]
+tag_build = 
+tag_date = 0
+
diff --git a/setup.py b/setup.py
new file mode 100755 (executable)
index 0000000..59534c6
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,52 @@
+# -*- coding:utf-8 -*-
+
+import os
+import skbuild
+import setuptools
+
+
+def get_long_desciption():
+    """Return the text of the README.md that sits next to this file.
+
+    The file is decoded as UTF-8 explicitly so the result does not depend on
+    the default locale encoding of the machine running the build.
+    """
+    cwd = os.path.abspath(os.path.dirname(__file__))
+    filename = os.path.join(cwd, "README.md")
+    with open(filename, encoding="utf-8") as f:
+        long_description = f.read()
+
+    return long_description
+
+
+# Package metadata; packages are discovered under src/ and built via skbuild.
+skbuild.setup(
+    name = "pcre2",
+    version = "0.4.0",
+    description = "Python bindings for the PCRE2 regular expression library",
+    long_description = get_long_desciption(),
+    long_description_content_type = "text/markdown",
+    license = "BSD 3-Clause License",
+    author = "Garrett Tetrault",
+    url = "https://github.com/grtetrault/pcre2.py",
+    classifiers = [
+        "Development Status :: 3 - Alpha",
+        "Intended Audience :: Developers",
+        "License :: OSI Approved :: BSD License",
+        "Programming Language :: C",
+        "Programming Language :: Cython",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Operating System :: MacOS :: MacOS X",
+        "Operating System :: POSIX :: Linux",
+        "Operating System :: Microsoft :: Windows"
+    ],
+    include_package_data=True,
+    packages = setuptools.find_packages("src"),
+    package_dir = {"": "src"},
+    cmake_languages = "C",
+)
diff --git a/src/pcre2.egg-info/PKG-INFO b/src/pcre2.egg-info/PKG-INFO
new file mode 100644 (file)
index 0000000..0a6cd5b
--- /dev/null
@@ -0,0 +1,134 @@
+Metadata-Version: 2.1
+Name: pcre2
+Version: 0.4.0
+Summary: Python bindings for the PCRE2 regular expression library
+Home-page: https://github.com/grtetrault/pcre2.py
+Author: Garrett Tetrault
+License: BSD 3-Clause License
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Programming Language :: C
+Classifier: Programming Language :: Cython
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Operating System :: Microsoft :: Windows
+Description-Content-Type: text/markdown
+License-File: LICENSE
+
+# PCRE2.py: Python bindings for the PCRE2 regular expression library
+
+This project contains Python bindings for [PCRE2](https://github.com/PCRE2Project/pcre2).
+PCRE2 is the revised API for the Perl-compatible regular expressions (PCRE) library created by Philip Hazel.
+For original source code, see the [official PCRE2 repository](https://github.com/PCRE2Project/pcre2).
+
+## Installation
+
+From PyPI:
+```
+pip install pcre2
+```
+
+If a wheel is not available for your platform, the module will be built from source.
+Building requires:
+
+* `cmake`
+* C compiler toolchain, such as `gcc` and `make`
+* `libtool`
+* Python headers
+
+## Usage
+
+Regular expressions are compiled with `pcre2.compile()` which accepts both unicode strings and bytes-like objects.
+This returns a `Pattern` object.
+Expressions can be compiled with a number of options (combined with the bitwise-or operator) and can be JIT compiled,
+
+```python
+>>> import pcre2
+>>> expr = r'(?<head>\w+)\s+(?<tail>\w+)'
+>>> patn = pcre2.compile(expr, options=pcre2.I, jit=True)
+>>> # Patterns can also be JIT compiled after initialization.
+>>> patn.jit_compile()
+```
+
+Inspection of `Pattern` objects is done as follows,
+
+```python
+>>> patn.jit_size
+980
+>>> patn.name_dict()
+{1: 'head', 2: 'tail'}
+>>> patn.options
+524296
+>>> # Deeper inspection into options is available.
+>>> pcre2.CompileOption.decompose(patn.options)
+[<CompileOption.CASELESS: 0x8>, <CompileOption.UTF: 0x80000>]
+```
+
+Once compiled, `Pattern` objects can be used to match against strings.
+Matching returns a `Match` object, which has several functions to view results,
+
+```python
+>>> subj = 'foo bar buzz bazz'
+>>> match = patn.match(subj)
+>>> match.substring()
+'foo bar'
+>>> match.start(), match.end()
+(0, 7)
+```
+
+Substitution is also supported, both from `Pattern` and `Match` objects,
+
+```python
+>>> repl = '$2 $1'
+>>> patn.substitute(repl, subj) # Global substitutions by default.
+'bar foo bazz buzz'
+>>> patn.substitute(repl, subj, suball=False)
+'bar foo buzz bazz'
+>>> match.expand(repl)
+'bar foo buzz bazz'
+```
+
+Additionally, `Pattern` objects support scanning over subjects for all non-overlapping matches,
+
+```python
+>>> for match in patn.scan(subj):
+...     print(match.substring('head'))
+...
+foo
+buzz
+```
+
+## Performance
+
+PCRE2 provides a fast regular expression library, particularly with JIT compilation enabled.
+Below are the `regex-redux` benchmark results included in this repository,
+
+| Script              | Number of runs | Total time | Real time  | User time   | System time   |
+| ------------------- | -------------- | ---------- | ---------- | ----------- | ------------- |
+| `baseline.py`       |             10 |      3.020 |      0.302 |       0.020 |         0.086 |
+| `vanilla.py`        |             10 |     51.380 |      5.138 |      11.408 |         0.529 |
+| `hand_optimized.py` |             10 |     13.190 |      1.319 |       2.846 |         0.344 |
+| `pcre2_module.py`   |             10 |     13.670 |      1.367 |       2.269 |         0.532 |
+Script descriptions are as follows,
+
+| Script              | Description                                                          |
+| ------------------- | -------------------------------------------------------------------- |
+| `baseline.py`       | Reads input file and outputs stored expected output                  |
+| `vanilla.py`        | Pure Python version                                                  |
+| `hand_optimized.py` | Manually written Python `ctypes` bindings for shared PCRE2 C library |
+| `pcre2_module.py`   | Implementation using Python bindings written here                    |
+
+Tests were performed on an M2 Macbook Air.
+Note that to run benchmarks locally, [Git LFS](https://git-lfs.com/) must be installed to download the input dataset.
+Additionally, a Python virtual environment must be created, and the package built
+with `make init` and `make build` respectively.
+For more information on this benchmark, see [The Computer Language Benchmarks Game](https://benchmarksgame-team.pages.debian.net/benchmarksgame/performance/regexredux.html).
+See source code of benchmark scripts for details and original sources.
diff --git a/src/pcre2.egg-info/SOURCES.txt b/src/pcre2.egg-info/SOURCES.txt
new file mode 100644 (file)
index 0000000..391a133
--- /dev/null
@@ -0,0 +1,498 @@
+CMakeLists.txt
+LICENSE
+Makefile
+README.md
+pyproject.toml
+setup.py
+requirements/build-requirements.txt
+requirements/test-requirements.txt
+src/libpcre2/.bazelrc
+src/libpcre2/.git
+src/libpcre2/.gitignore
+src/libpcre2/132html
+src/libpcre2/AUTHORS
+src/libpcre2/BUILD.bazel
+src/libpcre2/CMakeLists.txt
+src/libpcre2/COPYING
+src/libpcre2/ChangeLog
+src/libpcre2/CheckMan
+src/libpcre2/CleanTxt
+src/libpcre2/Detrail
+src/libpcre2/HACKING
+src/libpcre2/LICENCE
+src/libpcre2/MODULE.bazel
+src/libpcre2/Makefile.am
+src/libpcre2/NEWS
+src/libpcre2/NON-AUTOTOOLS-BUILD
+src/libpcre2/PrepareRelease
+src/libpcre2/README
+src/libpcre2/README.md
+src/libpcre2/RunGrepTest
+src/libpcre2/RunGrepTest.bat
+src/libpcre2/RunTest
+src/libpcre2/RunTest.bat
+src/libpcre2/WORKSPACE.bazel
+src/libpcre2/autogen.sh
+src/libpcre2/build.zig
+src/libpcre2/config-cmake.h.in
+src/libpcre2/configure.ac
+src/libpcre2/index.md
+src/libpcre2/libpcre2-16.pc.in
+src/libpcre2/libpcre2-32.pc.in
+src/libpcre2/libpcre2-8.pc.in
+src/libpcre2/libpcre2-posix.pc.in
+src/libpcre2/pcre2-config.in
+src/libpcre2/pcre2_fuzzer.dict
+src/libpcre2/pcre2_fuzzer.options
+src/libpcre2/pcre2_fuzzer_16.dict
+src/libpcre2/pcre2_fuzzer_16.options
+src/libpcre2/pcre2_fuzzer_32.dict
+src/libpcre2/pcre2_fuzzer_32.options
+src/libpcre2/perltest.sh
+src/libpcre2/.github/workflows/build.yml
+src/libpcre2/.github/workflows/cifuzz.yml
+src/libpcre2/.github/workflows/codeql.yml
+src/libpcre2/.github/workflows/dev.yml
+src/libpcre2/.github/workflows/scorecards.yml
+src/libpcre2/cmake/COPYING-CMAKE-SCRIPTS
+src/libpcre2/cmake/FindEditline.cmake
+src/libpcre2/cmake/FindPackageHandleStandardArgs.cmake
+src/libpcre2/cmake/FindReadline.cmake
+src/libpcre2/cmake/pcre2-config-version.cmake.in
+src/libpcre2/cmake/pcre2-config.cmake.in
+src/libpcre2/doc/index.html.src
+src/libpcre2/doc/pcre2-config.1
+src/libpcre2/doc/pcre2-config.txt
+src/libpcre2/doc/pcre2.3
+src/libpcre2/doc/pcre2.txt
+src/libpcre2/doc/pcre2_callout_enumerate.3
+src/libpcre2/doc/pcre2_code_copy.3
+src/libpcre2/doc/pcre2_code_copy_with_tables.3
+src/libpcre2/doc/pcre2_code_free.3
+src/libpcre2/doc/pcre2_compile.3
+src/libpcre2/doc/pcre2_compile_context_copy.3
+src/libpcre2/doc/pcre2_compile_context_create.3
+src/libpcre2/doc/pcre2_compile_context_free.3
+src/libpcre2/doc/pcre2_config.3
+src/libpcre2/doc/pcre2_convert_context_copy.3
+src/libpcre2/doc/pcre2_convert_context_create.3
+src/libpcre2/doc/pcre2_convert_context_free.3
+src/libpcre2/doc/pcre2_converted_pattern_free.3
+src/libpcre2/doc/pcre2_dfa_match.3
+src/libpcre2/doc/pcre2_general_context_copy.3
+src/libpcre2/doc/pcre2_general_context_create.3
+src/libpcre2/doc/pcre2_general_context_free.3
+src/libpcre2/doc/pcre2_get_error_message.3
+src/libpcre2/doc/pcre2_get_mark.3
+src/libpcre2/doc/pcre2_get_match_data_heapframes_size.3
+src/libpcre2/doc/pcre2_get_match_data_size.3
+src/libpcre2/doc/pcre2_get_ovector_count.3
+src/libpcre2/doc/pcre2_get_ovector_pointer.3
+src/libpcre2/doc/pcre2_get_startchar.3
+src/libpcre2/doc/pcre2_jit_compile.3
+src/libpcre2/doc/pcre2_jit_free_unused_memory.3
+src/libpcre2/doc/pcre2_jit_match.3
+src/libpcre2/doc/pcre2_jit_stack_assign.3
+src/libpcre2/doc/pcre2_jit_stack_create.3
+src/libpcre2/doc/pcre2_jit_stack_free.3
+src/libpcre2/doc/pcre2_maketables.3
+src/libpcre2/doc/pcre2_maketables_free.3
+src/libpcre2/doc/pcre2_match.3
+src/libpcre2/doc/pcre2_match_context_copy.3
+src/libpcre2/doc/pcre2_match_context_create.3
+src/libpcre2/doc/pcre2_match_context_free.3
+src/libpcre2/doc/pcre2_match_data_create.3
+src/libpcre2/doc/pcre2_match_data_create_from_pattern.3
+src/libpcre2/doc/pcre2_match_data_free.3
+src/libpcre2/doc/pcre2_pattern_convert.3
+src/libpcre2/doc/pcre2_pattern_info.3
+src/libpcre2/doc/pcre2_serialize_decode.3
+src/libpcre2/doc/pcre2_serialize_encode.3
+src/libpcre2/doc/pcre2_serialize_free.3
+src/libpcre2/doc/pcre2_serialize_get_number_of_codes.3
+src/libpcre2/doc/pcre2_set_bsr.3
+src/libpcre2/doc/pcre2_set_callout.3
+src/libpcre2/doc/pcre2_set_character_tables.3
+src/libpcre2/doc/pcre2_set_compile_extra_options.3
+src/libpcre2/doc/pcre2_set_compile_recursion_guard.3
+src/libpcre2/doc/pcre2_set_depth_limit.3
+src/libpcre2/doc/pcre2_set_glob_escape.3
+src/libpcre2/doc/pcre2_set_glob_separator.3
+src/libpcre2/doc/pcre2_set_heap_limit.3
+src/libpcre2/doc/pcre2_set_match_limit.3
+src/libpcre2/doc/pcre2_set_max_pattern_length.3
+src/libpcre2/doc/pcre2_set_max_varlookbehind.3
+src/libpcre2/doc/pcre2_set_newline.3
+src/libpcre2/doc/pcre2_set_offset_limit.3
+src/libpcre2/doc/pcre2_set_parens_nest_limit.3
+src/libpcre2/doc/pcre2_set_recursion_limit.3
+src/libpcre2/doc/pcre2_set_recursion_memory_management.3
+src/libpcre2/doc/pcre2_set_substitute_callout.3
+src/libpcre2/doc/pcre2_substitute.3
+src/libpcre2/doc/pcre2_substring_copy_byname.3
+src/libpcre2/doc/pcre2_substring_copy_bynumber.3
+src/libpcre2/doc/pcre2_substring_free.3
+src/libpcre2/doc/pcre2_substring_get_byname.3
+src/libpcre2/doc/pcre2_substring_get_bynumber.3
+src/libpcre2/doc/pcre2_substring_length_byname.3
+src/libpcre2/doc/pcre2_substring_length_bynumber.3
+src/libpcre2/doc/pcre2_substring_list_free.3
+src/libpcre2/doc/pcre2_substring_list_get.3
+src/libpcre2/doc/pcre2_substring_nametable_scan.3
+src/libpcre2/doc/pcre2_substring_number_from_name.3
+src/libpcre2/doc/pcre2api.3
+src/libpcre2/doc/pcre2build.3
+src/libpcre2/doc/pcre2callout.3
+src/libpcre2/doc/pcre2compat.3
+src/libpcre2/doc/pcre2convert.3
+src/libpcre2/doc/pcre2demo.3
+src/libpcre2/doc/pcre2grep.1
+src/libpcre2/doc/pcre2grep.txt
+src/libpcre2/doc/pcre2jit.3
+src/libpcre2/doc/pcre2limits.3
+src/libpcre2/doc/pcre2matching.3
+src/libpcre2/doc/pcre2partial.3
+src/libpcre2/doc/pcre2pattern.3
+src/libpcre2/doc/pcre2perform.3
+src/libpcre2/doc/pcre2posix.3
+src/libpcre2/doc/pcre2sample.3
+src/libpcre2/doc/pcre2serialize.3
+src/libpcre2/doc/pcre2syntax.3
+src/libpcre2/doc/pcre2test.1
+src/libpcre2/doc/pcre2test.txt
+src/libpcre2/doc/pcre2unicode.3
+src/libpcre2/doc/html/NON-AUTOTOOLS-BUILD.txt
+src/libpcre2/doc/html/README.txt
+src/libpcre2/doc/html/index.html
+src/libpcre2/doc/html/pcre2-config.html
+src/libpcre2/doc/html/pcre2.html
+src/libpcre2/doc/html/pcre2_callout_enumerate.html
+src/libpcre2/doc/html/pcre2_code_copy.html
+src/libpcre2/doc/html/pcre2_code_copy_with_tables.html
+src/libpcre2/doc/html/pcre2_code_free.html
+src/libpcre2/doc/html/pcre2_compile.html
+src/libpcre2/doc/html/pcre2_compile_context_copy.html
+src/libpcre2/doc/html/pcre2_compile_context_create.html
+src/libpcre2/doc/html/pcre2_compile_context_free.html
+src/libpcre2/doc/html/pcre2_config.html
+src/libpcre2/doc/html/pcre2_convert_context_copy.html
+src/libpcre2/doc/html/pcre2_convert_context_create.html
+src/libpcre2/doc/html/pcre2_convert_context_free.html
+src/libpcre2/doc/html/pcre2_converted_pattern_free.html
+src/libpcre2/doc/html/pcre2_dfa_match.html
+src/libpcre2/doc/html/pcre2_general_context_copy.html
+src/libpcre2/doc/html/pcre2_general_context_create.html
+src/libpcre2/doc/html/pcre2_general_context_free.html
+src/libpcre2/doc/html/pcre2_get_error_message.html
+src/libpcre2/doc/html/pcre2_get_mark.html
+src/libpcre2/doc/html/pcre2_get_match_data_heapframes_size.html
+src/libpcre2/doc/html/pcre2_get_match_data_size.html
+src/libpcre2/doc/html/pcre2_get_ovector_count.html
+src/libpcre2/doc/html/pcre2_get_ovector_pointer.html
+src/libpcre2/doc/html/pcre2_get_startchar.html
+src/libpcre2/doc/html/pcre2_jit_compile.html
+src/libpcre2/doc/html/pcre2_jit_free_unused_memory.html
+src/libpcre2/doc/html/pcre2_jit_match.html
+src/libpcre2/doc/html/pcre2_jit_stack_assign.html
+src/libpcre2/doc/html/pcre2_jit_stack_create.html
+src/libpcre2/doc/html/pcre2_jit_stack_free.html
+src/libpcre2/doc/html/pcre2_maketables.html
+src/libpcre2/doc/html/pcre2_maketables_free.html
+src/libpcre2/doc/html/pcre2_match.html
+src/libpcre2/doc/html/pcre2_match_context_copy.html
+src/libpcre2/doc/html/pcre2_match_context_create.html
+src/libpcre2/doc/html/pcre2_match_context_free.html
+src/libpcre2/doc/html/pcre2_match_data_create.html
+src/libpcre2/doc/html/pcre2_match_data_create_from_pattern.html
+src/libpcre2/doc/html/pcre2_match_data_free.html
+src/libpcre2/doc/html/pcre2_pattern_convert.html
+src/libpcre2/doc/html/pcre2_pattern_info.html
+src/libpcre2/doc/html/pcre2_serialize_decode.html
+src/libpcre2/doc/html/pcre2_serialize_encode.html
+src/libpcre2/doc/html/pcre2_serialize_free.html
+src/libpcre2/doc/html/pcre2_serialize_get_number_of_codes.html
+src/libpcre2/doc/html/pcre2_set_bsr.html
+src/libpcre2/doc/html/pcre2_set_callout.html
+src/libpcre2/doc/html/pcre2_set_character_tables.html
+src/libpcre2/doc/html/pcre2_set_compile_extra_options.html
+src/libpcre2/doc/html/pcre2_set_compile_recursion_guard.html
+src/libpcre2/doc/html/pcre2_set_depth_limit.html
+src/libpcre2/doc/html/pcre2_set_glob_escape.html
+src/libpcre2/doc/html/pcre2_set_glob_separator.html
+src/libpcre2/doc/html/pcre2_set_heap_limit.html
+src/libpcre2/doc/html/pcre2_set_match_limit.html
+src/libpcre2/doc/html/pcre2_set_max_pattern_length.html
+src/libpcre2/doc/html/pcre2_set_max_varlookbehind.html
+src/libpcre2/doc/html/pcre2_set_newline.html
+src/libpcre2/doc/html/pcre2_set_offset_limit.html
+src/libpcre2/doc/html/pcre2_set_parens_nest_limit.html
+src/libpcre2/doc/html/pcre2_set_recursion_limit.html
+src/libpcre2/doc/html/pcre2_set_recursion_memory_management.html
+src/libpcre2/doc/html/pcre2_set_substitute_callout.html
+src/libpcre2/doc/html/pcre2_substitute.html
+src/libpcre2/doc/html/pcre2_substring_copy_byname.html
+src/libpcre2/doc/html/pcre2_substring_copy_bynumber.html
+src/libpcre2/doc/html/pcre2_substring_free.html
+src/libpcre2/doc/html/pcre2_substring_get_byname.html
+src/libpcre2/doc/html/pcre2_substring_get_bynumber.html
+src/libpcre2/doc/html/pcre2_substring_length_byname.html
+src/libpcre2/doc/html/pcre2_substring_length_bynumber.html
+src/libpcre2/doc/html/pcre2_substring_list_free.html
+src/libpcre2/doc/html/pcre2_substring_list_get.html
+src/libpcre2/doc/html/pcre2_substring_nametable_scan.html
+src/libpcre2/doc/html/pcre2_substring_number_from_name.html
+src/libpcre2/doc/html/pcre2api.html
+src/libpcre2/doc/html/pcre2build.html
+src/libpcre2/doc/html/pcre2callout.html
+src/libpcre2/doc/html/pcre2compat.html
+src/libpcre2/doc/html/pcre2convert.html
+src/libpcre2/doc/html/pcre2demo.html
+src/libpcre2/doc/html/pcre2grep.html
+src/libpcre2/doc/html/pcre2jit.html
+src/libpcre2/doc/html/pcre2limits.html
+src/libpcre2/doc/html/pcre2matching.html
+src/libpcre2/doc/html/pcre2partial.html
+src/libpcre2/doc/html/pcre2pattern.html
+src/libpcre2/doc/html/pcre2perform.html
+src/libpcre2/doc/html/pcre2posix.html
+src/libpcre2/doc/html/pcre2sample.html
+src/libpcre2/doc/html/pcre2serialize.html
+src/libpcre2/doc/html/pcre2syntax.html
+src/libpcre2/doc/html/pcre2test.html
+src/libpcre2/doc/html/pcre2unicode.html
+src/libpcre2/m4/ax_pthread.m4
+src/libpcre2/m4/pcre2_visibility.m4
+src/libpcre2/maint/GenerateCommon.py
+src/libpcre2/maint/GenerateTest26.py
+src/libpcre2/maint/GenerateUcd.py
+src/libpcre2/maint/GenerateUcpHeader.py
+src/libpcre2/maint/GenerateUcpTables.py
+src/libpcre2/maint/ManyConfigTests
+src/libpcre2/maint/README
+src/libpcre2/maint/pcre2_chartables.c.non-standard
+src/libpcre2/maint/ucptest.c
+src/libpcre2/maint/utf8.c
+src/libpcre2/maint/Unicode.tables/BidiMirroring.txt
+src/libpcre2/maint/Unicode.tables/CaseFolding.txt
+src/libpcre2/maint/Unicode.tables/DerivedBidiClass.txt
+src/libpcre2/maint/Unicode.tables/DerivedCoreProperties.txt
+src/libpcre2/maint/Unicode.tables/DerivedGeneralCategory.txt
+src/libpcre2/maint/Unicode.tables/GraphemeBreakProperty.txt
+src/libpcre2/maint/Unicode.tables/PropList.txt
+src/libpcre2/maint/Unicode.tables/PropertyAliases.txt
+src/libpcre2/maint/Unicode.tables/PropertyValueAliases.txt
+src/libpcre2/maint/Unicode.tables/ScriptExtensions.txt
+src/libpcre2/maint/Unicode.tables/Scripts.txt
+src/libpcre2/maint/Unicode.tables/UnicodeData.txt
+src/libpcre2/maint/Unicode.tables/emoji-data.txt
+src/libpcre2/maint/ucptestdata/testinput1
+src/libpcre2/maint/ucptestdata/testinput2
+src/libpcre2/maint/ucptestdata/testoutput1
+src/libpcre2/maint/ucptestdata/testoutput2
+src/libpcre2/src/config.h.generic
+src/libpcre2/src/config.h.in
+src/libpcre2/src/pcre2.h.generic
+src/libpcre2/src/pcre2.h.in
+src/libpcre2/src/pcre2_auto_possess.c
+src/libpcre2/src/pcre2_chartables.c.dist
+src/libpcre2/src/pcre2_chkdint.c
+src/libpcre2/src/pcre2_compile.c
+src/libpcre2/src/pcre2_config.c
+src/libpcre2/src/pcre2_context.c
+src/libpcre2/src/pcre2_convert.c
+src/libpcre2/src/pcre2_dfa_match.c
+src/libpcre2/src/pcre2_dftables.c
+src/libpcre2/src/pcre2_error.c
+src/libpcre2/src/pcre2_extuni.c
+src/libpcre2/src/pcre2_find_bracket.c
+src/libpcre2/src/pcre2_fuzzsupport.c
+src/libpcre2/src/pcre2_internal.h
+src/libpcre2/src/pcre2_intmodedep.h
+src/libpcre2/src/pcre2_jit_compile.c
+src/libpcre2/src/pcre2_jit_match.c
+src/libpcre2/src/pcre2_jit_misc.c
+src/libpcre2/src/pcre2_jit_neon_inc.h
+src/libpcre2/src/pcre2_jit_simd_inc.h
+src/libpcre2/src/pcre2_jit_test.c
+src/libpcre2/src/pcre2_maketables.c
+src/libpcre2/src/pcre2_match.c
+src/libpcre2/src/pcre2_match_data.c
+src/libpcre2/src/pcre2_newline.c
+src/libpcre2/src/pcre2_ord2utf.c
+src/libpcre2/src/pcre2_pattern_info.c
+src/libpcre2/src/pcre2_printint.c
+src/libpcre2/src/pcre2_script_run.c
+src/libpcre2/src/pcre2_serialize.c
+src/libpcre2/src/pcre2_string_utils.c
+src/libpcre2/src/pcre2_study.c
+src/libpcre2/src/pcre2_substitute.c
+src/libpcre2/src/pcre2_substring.c
+src/libpcre2/src/pcre2_tables.c
+src/libpcre2/src/pcre2_ucd.c
+src/libpcre2/src/pcre2_ucp.h
+src/libpcre2/src/pcre2_ucptables.c
+src/libpcre2/src/pcre2_valid_utf.c
+src/libpcre2/src/pcre2_xclass.c
+src/libpcre2/src/pcre2demo.c
+src/libpcre2/src/pcre2grep.c
+src/libpcre2/src/pcre2posix.c
+src/libpcre2/src/pcre2posix.h
+src/libpcre2/src/pcre2posix_test.c
+src/libpcre2/src/pcre2test.c
+src/libpcre2/src/sljit/sljitConfig.h
+src/libpcre2/src/sljit/sljitConfigCPU.h
+src/libpcre2/src/sljit/sljitConfigInternal.h
+src/libpcre2/src/sljit/sljitLir.c
+src/libpcre2/src/sljit/sljitLir.h
+src/libpcre2/src/sljit/sljitNativeARM_32.c
+src/libpcre2/src/sljit/sljitNativeARM_64.c
+src/libpcre2/src/sljit/sljitNativeARM_T2_32.c
+src/libpcre2/src/sljit/sljitNativeLOONGARCH_64.c
+src/libpcre2/src/sljit/sljitNativeMIPS_32.c
+src/libpcre2/src/sljit/sljitNativeMIPS_64.c
+src/libpcre2/src/sljit/sljitNativeMIPS_common.c
+src/libpcre2/src/sljit/sljitNativePPC_32.c
+src/libpcre2/src/sljit/sljitNativePPC_64.c
+src/libpcre2/src/sljit/sljitNativePPC_common.c
+src/libpcre2/src/sljit/sljitNativeRISCV_32.c
+src/libpcre2/src/sljit/sljitNativeRISCV_64.c
+src/libpcre2/src/sljit/sljitNativeRISCV_common.c
+src/libpcre2/src/sljit/sljitNativeS390X.c
+src/libpcre2/src/sljit/sljitNativeX86_32.c
+src/libpcre2/src/sljit/sljitNativeX86_64.c
+src/libpcre2/src/sljit/sljitNativeX86_common.c
+src/libpcre2/src/sljit/sljitUtils.c
+src/libpcre2/src/sljit/allocator_src/sljitExecAllocatorApple.c
+src/libpcre2/src/sljit/allocator_src/sljitExecAllocatorCore.c
+src/libpcre2/src/sljit/allocator_src/sljitExecAllocatorFreeBSD.c
+src/libpcre2/src/sljit/allocator_src/sljitExecAllocatorPosix.c
+src/libpcre2/src/sljit/allocator_src/sljitExecAllocatorWindows.c
+src/libpcre2/src/sljit/allocator_src/sljitProtExecAllocatorNetBSD.c
+src/libpcre2/src/sljit/allocator_src/sljitProtExecAllocatorPosix.c
+src/libpcre2/src/sljit/allocator_src/sljitWXExecAllocatorPosix.c
+src/libpcre2/src/sljit/allocator_src/sljitWXExecAllocatorWindows.c
+src/libpcre2/testdata/grepbinary
+src/libpcre2/testdata/grepfilelist
+src/libpcre2/testdata/grepinput
+src/libpcre2/testdata/grepinput3
+src/libpcre2/testdata/grepinput8
+src/libpcre2/testdata/grepinputC.bz2
+src/libpcre2/testdata/grepinputC.gz
+src/libpcre2/testdata/grepinputM
+src/libpcre2/testdata/grepinputv
+src/libpcre2/testdata/grepinputx
+src/libpcre2/testdata/greplist
+src/libpcre2/testdata/grepnot.bz2
+src/libpcre2/testdata/grepoutput
+src/libpcre2/testdata/grepoutput8
+src/libpcre2/testdata/grepoutputC
+src/libpcre2/testdata/grepoutputCN
+src/libpcre2/testdata/grepoutputCNU
+src/libpcre2/testdata/grepoutputCU
+src/libpcre2/testdata/grepoutputCbz2
+src/libpcre2/testdata/grepoutputCgz
+src/libpcre2/testdata/grepoutputN
+src/libpcre2/testdata/grepoutputUN
+src/libpcre2/testdata/greppatN4
+src/libpcre2/testdata/testbtables
+src/libpcre2/testdata/testinput1
+src/libpcre2/testdata/testinput10
+src/libpcre2/testdata/testinput11
+src/libpcre2/testdata/testinput12
+src/libpcre2/testdata/testinput13
+src/libpcre2/testdata/testinput14
+src/libpcre2/testdata/testinput15
+src/libpcre2/testdata/testinput16
+src/libpcre2/testdata/testinput17
+src/libpcre2/testdata/testinput18
+src/libpcre2/testdata/testinput19
+src/libpcre2/testdata/testinput2
+src/libpcre2/testdata/testinput20
+src/libpcre2/testdata/testinput21
+src/libpcre2/testdata/testinput22
+src/libpcre2/testdata/testinput23
+src/libpcre2/testdata/testinput24
+src/libpcre2/testdata/testinput25
+src/libpcre2/testdata/testinput26
+src/libpcre2/testdata/testinput3
+src/libpcre2/testdata/testinput4
+src/libpcre2/testdata/testinput5
+src/libpcre2/testdata/testinput6
+src/libpcre2/testdata/testinput7
+src/libpcre2/testdata/testinput8
+src/libpcre2/testdata/testinput9
+src/libpcre2/testdata/testinputEBC
+src/libpcre2/testdata/testinputheap
+src/libpcre2/testdata/testoutput1
+src/libpcre2/testdata/testoutput10
+src/libpcre2/testdata/testoutput11-16
+src/libpcre2/testdata/testoutput11-32
+src/libpcre2/testdata/testoutput12-16
+src/libpcre2/testdata/testoutput12-32
+src/libpcre2/testdata/testoutput13
+src/libpcre2/testdata/testoutput14-16
+src/libpcre2/testdata/testoutput14-32
+src/libpcre2/testdata/testoutput14-8
+src/libpcre2/testdata/testoutput15
+src/libpcre2/testdata/testoutput16
+src/libpcre2/testdata/testoutput17
+src/libpcre2/testdata/testoutput18
+src/libpcre2/testdata/testoutput19
+src/libpcre2/testdata/testoutput2
+src/libpcre2/testdata/testoutput20
+src/libpcre2/testdata/testoutput21
+src/libpcre2/testdata/testoutput22-16
+src/libpcre2/testdata/testoutput22-32
+src/libpcre2/testdata/testoutput22-8
+src/libpcre2/testdata/testoutput23
+src/libpcre2/testdata/testoutput24
+src/libpcre2/testdata/testoutput25
+src/libpcre2/testdata/testoutput26
+src/libpcre2/testdata/testoutput3
+src/libpcre2/testdata/testoutput3A
+src/libpcre2/testdata/testoutput3B
+src/libpcre2/testdata/testoutput4
+src/libpcre2/testdata/testoutput5
+src/libpcre2/testdata/testoutput6
+src/libpcre2/testdata/testoutput7
+src/libpcre2/testdata/testoutput8-16-2
+src/libpcre2/testdata/testoutput8-16-3
+src/libpcre2/testdata/testoutput8-16-4
+src/libpcre2/testdata/testoutput8-32-2
+src/libpcre2/testdata/testoutput8-32-3
+src/libpcre2/testdata/testoutput8-32-4
+src/libpcre2/testdata/testoutput8-8-2
+src/libpcre2/testdata/testoutput8-8-3
+src/libpcre2/testdata/testoutput8-8-4
+src/libpcre2/testdata/testoutput9
+src/libpcre2/testdata/testoutputEBC
+src/libpcre2/testdata/testoutputheap-16
+src/libpcre2/testdata/testoutputheap-32
+src/libpcre2/testdata/testoutputheap-8
+src/libpcre2/testdata/valgrind-jit.supp
+src/libpcre2/testdata/wintestinput3
+src/libpcre2/testdata/wintestoutput3
+src/pcre2/CMakeLists.txt
+src/pcre2/__init__.py
+src/pcre2/consts.pxd
+src/pcre2/consts.pyx
+src/pcre2/exceptions.pxd
+src/pcre2/exceptions.pyx
+src/pcre2/libpcre2.pxd
+src/pcre2/match.pxd
+src/pcre2/match.pyx
+src/pcre2/methods.pxd
+src/pcre2/methods.pyx
+src/pcre2/pattern.pxd
+src/pcre2/pattern.pyx
+src/pcre2/scanner.pxd
+src/pcre2/scanner.pyx
+src/pcre2/utils.pxd
+src/pcre2/utils.pyx
+src/pcre2.egg-info/PKG-INFO
+src/pcre2.egg-info/SOURCES.txt
+src/pcre2.egg-info/dependency_links.txt
+src/pcre2.egg-info/top_level.txt
+tests/test_groups.py
+tests/test_match.py
+tests/test_pattern.py
\ No newline at end of file
diff --git a/src/pcre2.egg-info/dependency_links.txt b/src/pcre2.egg-info/dependency_links.txt
new file mode 100644 (file)
index 0000000..8b13789
--- /dev/null
@@ -0,0 +1 @@
+
diff --git a/src/pcre2.egg-info/top_level.txt b/src/pcre2.egg-info/top_level.txt
new file mode 100644 (file)
index 0000000..92d5e6d
--- /dev/null
@@ -0,0 +1 @@
+pcre2
diff --git a/src/pcre2/CMakeLists.txt b/src/pcre2/CMakeLists.txt
new file mode 100644 (file)
index 0000000..38c117e
--- /dev/null
@@ -0,0 +1,53 @@
+# Locate the Cython and PythonExtensions CMake modules; both are mandatory.
+# NOTE(review): these find-modules are presumably supplied by scikit-build -- confirm.
+find_package(Cython MODULE REQUIRED)
+find_package(PythonExtensions MODULE REQUIRED)
+
+# Add this directory to the directory-scoped include path.
+# NOTE(review): the Cython helper may read this directory property to resolve the
+# local .pxd files -- confirm before replacing it with a target-scoped command.
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+# Build Cython with annotations.
+set(CYTHON_ANNOTATE TRUE)
+
+# Macro to add Cython files as modules, configured to build with PCRE2.
+#
+# Arguments:
+#   filename - base name of the Cython module without extension (called below as
+#              e.g. `add_pyx_file(consts)`); it is also used as the CMake target name.
+#
+# Every module links against `pcre2-8-static`, gets ${PCRE2_INCLUDE_DIR} and
+# ${CYTHON_EXTRA_COMPILE_ARGS} as PRIVATE settings, and is installed to src/pcre2.
+# NOTE(review): `pcre2-8-static`, PCRE2_INCLUDE_DIR and CYTHON_EXTRA_COMPILE_ARGS are
+# not defined in this file; presumably the top-level CMakeLists.txt provides them.
+macro(add_pyx_file filename)
+    # NOTE(review): C / PY3 presumably select C output and Python 3 semantics -- confirm.
+    add_cython_target(${filename} C PY3)
+    add_library(${filename} MODULE ${filename})
+    python_extension_module(${filename})
+
+    # NOTE(review): keyword-less signature. If python_extension_module() also uses the
+    # plain signature on this target, adding PRIVATE here would be rejected by CMake
+    # (signatures cannot be mixed) -- check before changing.
+    target_link_libraries(${filename} pcre2-8-static)
+    target_include_directories(${filename} PRIVATE ${PCRE2_INCLUDE_DIR})
+    target_compile_options(${filename} PRIVATE ${CYTHON_EXTRA_COMPILE_ARGS})
+    
+    install(TARGETS ${filename} LIBRARY DESTINATION src/pcre2)
+endmacro()
+
+# The modules are listed explicitly instead of being collected with file(GLOB),
+# which the CMake documentation recommends against for source lists:
+# https://cmake.org/cmake/help/v3.14/command/file.html?highlight=file#filesystem
+add_pyx_file(consts)
+add_pyx_file(exceptions)
+add_pyx_file(match)
+add_pyx_file(methods)
+add_pyx_file(pattern)
+add_pyx_file(scanner)
+add_pyx_file(utils)
+
+
+# Include .pyx and .pxd files in distribution for use by Cython API.
+# libpcre2.pxd is shipped here too although it has no add_pyx_file() call above:
+# only its .pxd is listed, there is no libpcre2.pyx to build.
+# The files go to the same destination (src/pcre2) as the compiled modules.
+install(
+    FILES
+        consts.pxd
+        consts.pyx
+        exceptions.pxd
+        exceptions.pyx
+        libpcre2.pxd
+        match.pxd
+        match.pyx
+        methods.pxd
+        methods.pyx
+        pattern.pxd
+        pattern.pyx
+        scanner.pxd
+        scanner.pyx
+        utils.pxd
+        utils.pyx
+    DESTINATION
+        src/pcre2
+)
\ No newline at end of file
diff --git a/src/pcre2/__init__.py b/src/pcre2/__init__.py
new file mode 100755 (executable)
index 0000000..e08e5b5
--- /dev/null
@@ -0,0 +1,7 @@
+from .methods import compile, findall, match, scan, split, substitute
+from .consts import (
+    __libpcre2_version__,
+    CompileOption,
+    A, I, M, U, S, X
+)
+__version__ = "0.4.0"
diff --git a/src/pcre2/consts.pxd b/src/pcre2/consts.pxd
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/src/pcre2/consts.pyx b/src/pcre2/consts.pyx
new file mode 100644 (file)
index 0000000..0970dde
--- /dev/null
@@ -0,0 +1,89 @@
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from enum import IntEnum
+
+# Local imports.
+from .utils cimport *
+from .libpcre2 cimport *
+
+
+# "<major>.<minor>" string built from the PCRE2_MAJOR / PCRE2_MINOR values
+# cimported from libpcre2.
+__libpcre2_version__ = f"{PCRE2_MAJOR}.{PCRE2_MINOR}"
+
+
+class MetaOption(IntEnum):
+    def __repr__(self):
+        return f"<{self.__class__.__name__}.{self._name_}: 0x{self._value_:x}>"
+
+    @classmethod
+    def verify(cls, options):
+        """ Verify a number is composed of options.
+        """
+        tmp = options
+        for opt in cls:
+            tmp ^= (opt & tmp)
+        return tmp == 0
+
+
+    @classmethod
+    def decompose(cls, options):
+        """ Decompose a number into its component options, returning a list of
+        MetaOption enums that are components of the given options. Note that
+        left over bits are ignored, and veracity can not be determined from
+        the result.
+        """
+        return [opt for opt in cls if (opt & options)]
+
+
+class CompileOption(MetaOption):
+    """ Option bits to be used in pattern compilation. See the following PCRE2
+    documentation for a brief overview of the relevant options:
+    http://pcre.org/current/doc/html/pcre2_compile.html
+
+    Each member takes its value from the PCRE2_* constant of the same name
+    declared in libpcre2.pxd. Only the options listed here are exposed; the
+    declaration file contains further compile-time options.
+    """
+
+    ALLOW_EMPTY_CLASS = PCRE2_ALLOW_EMPTY_CLASS
+    ALT_BSUX = PCRE2_ALT_BSUX
+    ALT_CIRCUMFLEX = PCRE2_ALT_CIRCUMFLEX
+    ALT_VERBNAMES = PCRE2_ALT_VERBNAMES
+    ANCHORED = PCRE2_ANCHORED
+    CASELESS = PCRE2_CASELESS
+    DOLLAR_ENDONLY = PCRE2_DOLLAR_ENDONLY
+    DOTALL = PCRE2_DOTALL
+    DUPNAMES = PCRE2_DUPNAMES
+    ENDANCHORED = PCRE2_ENDANCHORED
+    EXTENDED = PCRE2_EXTENDED
+    EXTENDED_MORE = PCRE2_EXTENDED_MORE
+    FIRSTLINE = PCRE2_FIRSTLINE
+    LITERAL = PCRE2_LITERAL
+    MATCH_UNSET_BACKREF = PCRE2_MATCH_UNSET_BACKREF
+    MULTILINE = PCRE2_MULTILINE
+    UCP = PCRE2_UCP
+    UNGREEDY = PCRE2_UNGREEDY
+    UTF = PCRE2_UTF
+
+
+class BsrChar(IntEnum):
+    """ Indicator for what character(s) are denoted by `\\R`.
+    """
+    UNICODE = PCRE2_BSR_UNICODE
+    ANYCRLF = PCRE2_BSR_ANYCRLF
+
+
+class NewlineChar(IntEnum):
+    """ Indicator for what character(s) denote a newline.
+
+    Each member takes its value from the PCRE2_NEWLINE_* constant of the same
+    name declared in libpcre2.pxd.
+    """
+    CR = PCRE2_NEWLINE_CR
+    LF = PCRE2_NEWLINE_LF
+    CRLF = PCRE2_NEWLINE_CRLF
+    ANY = PCRE2_NEWLINE_ANY
+    ANYCRLF = PCRE2_NEWLINE_ANYCRLF
+    NUL = PCRE2_NEWLINE_NUL
+
+
+# Shorthands
+# Single-letter aliases for frequently used compile options; the package
+# __init__ re-exports exactly these six names.
+A = CompileOption.ANCHORED
+I = CompileOption.CASELESS
+M = CompileOption.MULTILINE
+U = CompileOption.UTF
+S = CompileOption.DOTALL
+X = CompileOption.EXTENDED
diff --git a/src/pcre2/exceptions.pxd b/src/pcre2/exceptions.pxd
new file mode 100755 (executable)
index 0000000..e69de29
diff --git a/src/pcre2/exceptions.pyx b/src/pcre2/exceptions.pyx
new file mode 100755 (executable)
index 0000000..f0ce9e3
--- /dev/null
@@ -0,0 +1,63 @@
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from libc.stdint cimport uint8_t
+
+
+# Local imports.
+from .utils cimport *
+from .libpcre2 cimport *
+
+
+class LibraryError(Exception):
+    """ Catch all for other PCRE2 errors (e.g. bad option bits).
+
+    The exception message is the text PCRE2 reports for `errorcode`,
+    capitalized and, when `context_msg` is non-empty, prefixed with
+    "<context_msg>. ". The code itself is stored on `self.errorcode`.
+
+    Constructing the exception raises MemoryError when the message lookup
+    returns PCRE2_ERROR_NOMEMORY, and a nested LibraryError (carrying the
+    lookup's return code) for any other negative return value.
+    """
+
+    def __init__(self, errorcode, context_msg=""):
+        cdef uint8_t errormsg_buf[120]
+        get_error_message_rc = pcre2_get_error_message(
+            errorcode,
+            errormsg_buf, sizeof(errormsg_buf)
+        )
+
+        # Handle errors in fetching error message.
+        if get_error_message_rc == PCRE2_ERROR_NOMEMORY:
+            raise MemoryError()
+        elif get_error_message_rc < 0:
+            # Name the code whose lookup failed (`errorcode`); the lookup's own
+            # return code is already carried by the nested exception. No
+            # trailing period here: the join below appends ". " itself.
+            raise LibraryError(
+                get_error_message_rc,
+                context_msg=f"Could not retrieve message for error code {errorcode}"
+            )
+
+        msg = errormsg_buf.decode("utf-8").capitalize()
+        if context_msg:
+            msg = context_msg + ". " + msg
+
+        super().__init__(msg)
+        self.errorcode = errorcode
+
+
+class CompileError(LibraryError):
+    """ Raised when pattern is malformed or is otherwise unable to be
+    compiled.
+    """
+    
+    def __init__(self, errorcode, context_msg=""):
+        if not (errorcode > 0):
+            raise ValueError("Compilation error codes are strictly positive")
+        
+        super().__init__(errorcode, context_msg=context_msg)
+
+
+class MatchError(LibraryError):
+    """ Raised when no or partial match found.
+    """
+    
+    def __init__(self, errorcode, context_msg=""):
+        if not (errorcode == PCRE2_ERROR_NOMATCH or errorcode == PCRE2_ERROR_PARTIAL):
+            raise ValueError(
+                f"Invalid error code '{errorcode}'. "
+                "Match error codes can only be of value PCRE2_ERROR_NOMATCH or PCRE2_ERROR_PARTIAL"
+            )
+        
+        super().__init__(errorcode, context_msg=context_msg)
diff --git a/src/pcre2/libpcre2.pxd b/src/pcre2/libpcre2.pxd
new file mode 100755 (executable)
index 0000000..1d95e9d
--- /dev/null
@@ -0,0 +1,501 @@
+# -*- coding:utf-8 -*-
+
+from libc.stdint cimport uint8_t, uint32_t, int32_t
+
+
+cdef extern from "pcre2.h":
+    cdef unsigned int PCRE2_MAJOR
+    cdef unsigned int PCRE2_MINOR
+
+    # The following option bits can be passed to pcre2_compile(),
+    # pcre2_match(), or pcre2_dfa_match(). PCRE2_NO_UTF_CHECK affects only the
+    # function to which it is passed. Put these bits at the most significant
+    # end of the options word so others can be added next to them.
+    cdef unsigned int PCRE2_ANCHORED
+    cdef unsigned int PCRE2_NO_UTF_CHECK
+    cdef unsigned int PCRE2_ENDANCHORED
+
+    # The following option bits can be passed only to pcre2_compile(). However,
+    # they may affect compilation, JIT compilation, and/or interpretive
+    # execution. The following tags indicate which:
+    # C   alters what is compiled by pcre2_compile()
+    # J   alters what is compiled by pcre2_jit_compile()
+    # M   is inspected during pcre2_match() execution
+    # D   is inspected during pcre2_dfa_match() execution
+    cdef unsigned int PCRE2_ALLOW_EMPTY_CLASS    # C       
+    cdef unsigned int PCRE2_ALT_BSUX             # C       
+    cdef unsigned int PCRE2_AUTO_CALLOUT         # C       
+    cdef unsigned int PCRE2_CASELESS             # C       
+    cdef unsigned int PCRE2_DOLLAR_ENDONLY       #   J M D 
+    cdef unsigned int PCRE2_DOTALL               # C       
+    cdef unsigned int PCRE2_DUPNAMES             # C       
+    cdef unsigned int PCRE2_EXTENDED             # C       
+    cdef unsigned int PCRE2_FIRSTLINE            #   J M D 
+    cdef unsigned int PCRE2_MATCH_UNSET_BACKREF  # C J M   
+    cdef unsigned int PCRE2_MULTILINE            # C       
+    cdef unsigned int PCRE2_NEVER_UCP            # C       
+    cdef unsigned int PCRE2_NEVER_UTF            # C       
+    cdef unsigned int PCRE2_NO_AUTO_CAPTURE      # C       
+    cdef unsigned int PCRE2_NO_AUTO_POSSESS      # C       
+    cdef unsigned int PCRE2_NO_DOTSTAR_ANCHOR    # C       
+    cdef unsigned int PCRE2_NO_START_OPTIMIZE    #   J M D 
+    cdef unsigned int PCRE2_UCP                  # C J M D 
+    cdef unsigned int PCRE2_UNGREEDY             # C       
+    cdef unsigned int PCRE2_UTF                  # C J M D 
+    cdef unsigned int PCRE2_NEVER_BACKSLASH_C    # C       
+    cdef unsigned int PCRE2_ALT_CIRCUMFLEX       #   J M D 
+    cdef unsigned int PCRE2_ALT_VERBNAMES        # C       
+    cdef unsigned int PCRE2_USE_OFFSET_LIMIT     #   J M D 
+    cdef unsigned int PCRE2_EXTENDED_MORE        # C       
+    cdef unsigned int PCRE2_LITERAL              # C       
+    cdef unsigned int PCRE2_MATCH_INVALID_UTF    #   J M D
+
+    # An additional compile options word is available in the compile context. 
+    cdef unsigned int PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  # C 
+    cdef unsigned int PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL    # C 
+    cdef unsigned int PCRE2_EXTRA_MATCH_WORD               # C 
+    cdef unsigned int PCRE2_EXTRA_MATCH_LINE               # C 
+    cdef unsigned int PCRE2_EXTRA_ESCAPED_CR_IS_LF         # C 
+    cdef unsigned int PCRE2_EXTRA_ALT_BSUX                 # C 
+    cdef unsigned int PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK     # C 
+
+    # These are for pcre2_jit_compile(). 
+    cdef unsigned int PCRE2_JIT_COMPLETE  # For full matching.
+    cdef unsigned int PCRE2_JIT_PARTIAL_SOFT
+    cdef unsigned int PCRE2_JIT_PARTIAL_HARD
+    cdef unsigned int PCRE2_JIT_INVALID_UTF
+
+    # These are for pcre2_match(), pcre2_dfa_match(), pcre2_jit_match(), and
+    # pcre2_substitute(). Some are allowed only for one of the functions, and
+    # in these cases it is noted below. Note that PCRE2_ANCHORED,
+    # PCRE2_ENDANCHORED and PCRE2_NO_UTF_CHECK can also be passed to these
+    # functions (though pcre2_jit_match() ignores the latter since it bypasses
+    # all sanity checks).
+    cdef unsigned int PCRE2_NOTBOL
+    cdef unsigned int PCRE2_NOTEOL
+    cdef unsigned int PCRE2_NOTEMPTY          # ) These two must be kept
+    cdef unsigned int PCRE2_NOTEMPTY_ATSTART  # ) adjacent to each other. 
+    cdef unsigned int PCRE2_PARTIAL_SOFT
+    cdef unsigned int PCRE2_PARTIAL_HARD
+    cdef unsigned int PCRE2_DFA_RESTART  # pcre2_dfa_match() only 
+    cdef unsigned int PCRE2_DFA_SHORTEST  # pcre2_dfa_match() only 
+    cdef unsigned int PCRE2_SUBSTITUTE_GLOBAL  # pcre2_substitute() only 
+    cdef unsigned int PCRE2_SUBSTITUTE_EXTENDED  # pcre2_substitute() only 
+    cdef unsigned int PCRE2_SUBSTITUTE_UNSET_EMPTY  # pcre2_substitute() only 
+    cdef unsigned int PCRE2_SUBSTITUTE_UNKNOWN_UNSET  # pcre2_substitute() only 
+    cdef unsigned int PCRE2_SUBSTITUTE_OVERFLOW_LENGTH  # pcre2_substitute() only 
+    cdef unsigned int PCRE2_NO_JIT  # Not for pcre2_dfa_match() 
+    cdef unsigned int PCRE2_COPY_MATCHED_SUBJECT
+    cdef unsigned int PCRE2_SUBSTITUTE_LITERAL  # pcre2_substitute() only 
+    cdef unsigned int PCRE2_SUBSTITUTE_MATCHED  # pcre2_substitute() only 
+    cdef unsigned int PCRE2_SUBSTITUTE_REPLACEMENT_ONLY  # pcre2_substitute() only 
+
+    # Options for pcre2_pattern_convert(). 
+    cdef unsigned int PCRE2_CONVERT_UTF
+    cdef unsigned int PCRE2_CONVERT_NO_UTF_CHECK
+    cdef unsigned int PCRE2_CONVERT_POSIX_BASIC
+    cdef unsigned int PCRE2_CONVERT_POSIX_EXTENDED
+    cdef unsigned int PCRE2_CONVERT_GLOB
+    cdef unsigned int PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR
+    cdef unsigned int PCRE2_CONVERT_GLOB_NO_STARSTAR
+
+    # Newline and \R settings, for use in compile contexts. The newline values
+    # must be kept in step with values set in config.h and both sets must all
+    # be greater than zero.
+    cdef int PCRE2_NEWLINE_CR
+    cdef int PCRE2_NEWLINE_LF
+    cdef int PCRE2_NEWLINE_CRLF
+    cdef int PCRE2_NEWLINE_ANY
+    cdef int PCRE2_NEWLINE_ANYCRLF
+    cdef int PCRE2_NEWLINE_NUL
+
+    cdef int PCRE2_BSR_UNICODE
+    cdef int PCRE2_BSR_ANYCRLF
+
+    # Error codes for pcre2_compile(). Some of these are also used by
+    # pcre2_pattern_convert().
+    cdef int PCRE2_ERROR_END_BACKSLASH
+    cdef int PCRE2_ERROR_END_BACKSLASH_C
+    cdef int PCRE2_ERROR_UNKNOWN_ESCAPE
+    cdef int PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER
+    cdef int PCRE2_ERROR_QUANTIFIER_TOO_BIG
+    cdef int PCRE2_ERROR_MISSING_SQUARE_BRACKET
+    cdef int PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS
+    cdef int PCRE2_ERROR_CLASS_RANGE_ORDER
+    cdef int PCRE2_ERROR_QUANTIFIER_INVALID
+    cdef int PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT
+    cdef int PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY
+    cdef int PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS
+    cdef int PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING
+    cdef int PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS
+    cdef int PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE
+    cdef int PCRE2_ERROR_NULL_PATTERN
+    cdef int PCRE2_ERROR_BAD_OPTIONS
+    cdef int PCRE2_ERROR_MISSING_COMMENT_CLOSING
+    cdef int PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP
+    cdef int PCRE2_ERROR_PATTERN_TOO_LARGE
+    cdef int PCRE2_ERROR_HEAP_FAILED
+    cdef int PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS
+    cdef int PCRE2_ERROR_INTERNAL_CODE_OVERFLOW
+    cdef int PCRE2_ERROR_MISSING_CONDITION_CLOSING
+    cdef int PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH
+    cdef int PCRE2_ERROR_ZERO_RELATIVE_REFERENCE
+    cdef int PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES
+    cdef int PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED
+    cdef int PCRE2_ERROR_BAD_RELATIVE_REFERENCE
+    cdef int PCRE2_ERROR_UNKNOWN_POSIX_CLASS
+    cdef int PCRE2_ERROR_INTERNAL_STUDY_ERROR
+    cdef int PCRE2_ERROR_UNICODE_NOT_SUPPORTED
+    cdef int PCRE2_ERROR_PARENTHESES_STACK_CHECK
+    cdef int PCRE2_ERROR_CODE_POINT_TOO_BIG
+    cdef int PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED
+    cdef int PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C
+    cdef int PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE
+    cdef int PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG
+    cdef int PCRE2_ERROR_MISSING_CALLOUT_CLOSING
+    cdef int PCRE2_ERROR_ESCAPE_INVALID_IN_VERB
+    cdef int PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P
+    cdef int PCRE2_ERROR_MISSING_NAME_TERMINATOR
+    cdef int PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME
+    cdef int PCRE2_ERROR_INVALID_SUBPATTERN_NAME
+    cdef int PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE
+    cdef int PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY
+    cdef int PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY
+    cdef int PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG
+    cdef int PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS
+    cdef int PCRE2_ERROR_CLASS_INVALID_RANGE
+    cdef int PCRE2_ERROR_OCTAL_BYTE_TOO_BIG
+    cdef int PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE
+    cdef int PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN
+    cdef int PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES
+    cdef int PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE
+    cdef int PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE
+    cdef int PCRE2_ERROR_BACKSLASH_G_SYNTAX
+    cdef int PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING
+    # Error 159 is obsolete and should now never occur 
+    cdef int PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED
+    cdef int PCRE2_ERROR_VERB_UNKNOWN
+    cdef int PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG
+    cdef int PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED
+    cdef int PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW
+    cdef int PCRE2_ERROR_INVALID_OCTAL
+    cdef int PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH
+    cdef int PCRE2_ERROR_MARK_MISSING_ARGUMENT
+    cdef int PCRE2_ERROR_INVALID_HEXADECIMAL
+    cdef int PCRE2_ERROR_BACKSLASH_C_SYNTAX
+    cdef int PCRE2_ERROR_BACKSLASH_K_SYNTAX
+    cdef int PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS
+    cdef int PCRE2_ERROR_BACKSLASH_N_IN_CLASS
+    cdef int PCRE2_ERROR_CALLOUT_STRING_TOO_LONG
+    cdef int PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT
+    cdef int PCRE2_ERROR_UTF_IS_DISABLED
+    cdef int PCRE2_ERROR_UCP_IS_DISABLED
+    cdef int PCRE2_ERROR_VERB_NAME_TOO_LONG
+    cdef int PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG
+    cdef int PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS
+    cdef int PCRE2_ERROR_VERSION_CONDITION_SYNTAX
+    cdef int PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS
+    cdef int PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER
+    cdef int PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER
+    cdef int PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED
+    cdef int PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP
+    cdef int PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED
+    cdef int PCRE2_ERROR_PATTERN_TOO_COMPLICATED
+    cdef int PCRE2_ERROR_LOOKBEHIND_TOO_LONG
+    cdef int PCRE2_ERROR_PATTERN_STRING_TOO_LONG
+    cdef int PCRE2_ERROR_INTERNAL_BAD_CODE
+    cdef int PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP
+    cdef int PCRE2_ERROR_NO_SURROGATES_IN_UTF16
+    cdef int PCRE2_ERROR_BAD_LITERAL_OPTIONS
+    cdef int PCRE2_ERROR_SUPPORTED_ONLY_IN_UNICODE
+    cdef int PCRE2_ERROR_INVALID_HYPHEN_IN_OPTIONS
+    cdef int PCRE2_ERROR_ALPHA_ASSERTION_UNKNOWN
+    cdef int PCRE2_ERROR_SCRIPT_RUN_NOT_AVAILABLE
+    cdef int PCRE2_ERROR_TOO_MANY_CAPTURES
+    cdef int PCRE2_ERROR_CONDITION_ATOMIC_ASSERTION_EXPECTED
+    cdef int PCRE2_ERROR_BACKSLASH_K_IN_LOOKAROUND
+
+    # "Expected" matching error codes: no match and partial match. 
+    cdef int PCRE2_ERROR_NOMATCH
+    cdef int PCRE2_ERROR_PARTIAL
+
+    # Error codes for UTF-8 validity checks.
+    cdef int PCRE2_ERROR_UTF8_ERR1
+    cdef int PCRE2_ERROR_UTF8_ERR2
+    cdef int PCRE2_ERROR_UTF8_ERR3
+    cdef int PCRE2_ERROR_UTF8_ERR4
+    cdef int PCRE2_ERROR_UTF8_ERR5
+    cdef int PCRE2_ERROR_UTF8_ERR6
+    cdef int PCRE2_ERROR_UTF8_ERR7
+    cdef int PCRE2_ERROR_UTF8_ERR8
+    cdef int PCRE2_ERROR_UTF8_ERR9
+    cdef int PCRE2_ERROR_UTF8_ERR10
+    cdef int PCRE2_ERROR_UTF8_ERR11
+    cdef int PCRE2_ERROR_UTF8_ERR12
+    cdef int PCRE2_ERROR_UTF8_ERR13
+    cdef int PCRE2_ERROR_UTF8_ERR14
+    cdef int PCRE2_ERROR_UTF8_ERR15
+    cdef int PCRE2_ERROR_UTF8_ERR16
+    cdef int PCRE2_ERROR_UTF8_ERR17
+    cdef int PCRE2_ERROR_UTF8_ERR18
+    cdef int PCRE2_ERROR_UTF8_ERR19
+    cdef int PCRE2_ERROR_UTF8_ERR20
+    cdef int PCRE2_ERROR_UTF8_ERR21
+
+    # Error codes for UTF-16 validity checks. 
+    cdef int PCRE2_ERROR_UTF16_ERR1
+    cdef int PCRE2_ERROR_UTF16_ERR2
+    cdef int PCRE2_ERROR_UTF16_ERR3
+
+    # Error codes for UTF-32 validity checks.
+    cdef int PCRE2_ERROR_UTF32_ERR1
+    cdef int PCRE2_ERROR_UTF32_ERR2
+
+    # Miscellaneous error codes for pcre2[_dfa]_match(), substring extraction
+    # functions, context functions, and serializing functions. They are in
+    # numerical order. Originally they were in alphabetical order too, but now
+    # that PCRE2 is released, the numbers must not be changed.
+    cdef int PCRE2_ERROR_BADDATA
+    cdef int PCRE2_ERROR_MIXEDTABLES  # Name was changed.
+    cdef int PCRE2_ERROR_BADMAGIC
+    cdef int PCRE2_ERROR_BADMODE
+    cdef int PCRE2_ERROR_BADOFFSET
+    cdef int PCRE2_ERROR_BADOPTION
+    cdef int PCRE2_ERROR_BADREPLACEMENT
+    cdef int PCRE2_ERROR_BADUTFOFFSET
+    cdef int PCRE2_ERROR_CALLOUT  # Never used by PCRE2 itself.
+    cdef int PCRE2_ERROR_DFA_BADRESTART
+    cdef int PCRE2_ERROR_DFA_RECURSE
+    cdef int PCRE2_ERROR_DFA_UCOND
+    cdef int PCRE2_ERROR_DFA_UFUNC
+    cdef int PCRE2_ERROR_DFA_UITEM
+    cdef int PCRE2_ERROR_DFA_WSSIZE
+    cdef int PCRE2_ERROR_INTERNAL
+    cdef int PCRE2_ERROR_JIT_BADOPTION
+    cdef int PCRE2_ERROR_JIT_STACKLIMIT
+    cdef int PCRE2_ERROR_MATCHLIMIT
+    cdef int PCRE2_ERROR_NOMEMORY
+    cdef int PCRE2_ERROR_NOSUBSTRING
+    cdef int PCRE2_ERROR_NOUNIQUESUBSTRING
+    cdef int PCRE2_ERROR_NULL
+    cdef int PCRE2_ERROR_RECURSELOOP
+    cdef int PCRE2_ERROR_DEPTHLIMIT
+    cdef int PCRE2_ERROR_RECURSIONLIMIT  # Obsolete synonym. 
+    cdef int PCRE2_ERROR_UNAVAILABLE
+    cdef int PCRE2_ERROR_UNSET
+    cdef int PCRE2_ERROR_BADOFFSETLIMIT
+    cdef int PCRE2_ERROR_BADREPESCAPE
+    cdef int PCRE2_ERROR_REPMISSINGBRACE
+    cdef int PCRE2_ERROR_BADSUBSTITUTION
+    cdef int PCRE2_ERROR_BADSUBSPATTERN
+    cdef int PCRE2_ERROR_TOOMANYREPLACE
+    cdef int PCRE2_ERROR_BADSERIALIZEDDATA
+    cdef int PCRE2_ERROR_HEAPLIMIT
+    cdef int PCRE2_ERROR_CONVERT_SYNTAX
+    cdef int PCRE2_ERROR_INTERNAL_DUPMATCH
+    cdef int PCRE2_ERROR_DFA_UINVALID_UTF
+
+    # Request types for pcre2_pattern_info().
+    cdef int PCRE2_INFO_ALLOPTIONS
+    cdef int PCRE2_INFO_ARGOPTIONS
+    cdef int PCRE2_INFO_BACKREFMAX
+    cdef int PCRE2_INFO_BSR
+    cdef int PCRE2_INFO_CAPTURECOUNT
+    cdef int PCRE2_INFO_FIRSTCODEUNIT
+    cdef int PCRE2_INFO_FIRSTCODETYPE
+    cdef int PCRE2_INFO_FIRSTBITMAP
+    cdef int PCRE2_INFO_HASCRORLF
+    cdef int PCRE2_INFO_JCHANGED
+    cdef int PCRE2_INFO_JITSIZE
+    cdef int PCRE2_INFO_LASTCODEUNIT
+    cdef int PCRE2_INFO_LASTCODETYPE
+    cdef int PCRE2_INFO_MATCHEMPTY
+    cdef int PCRE2_INFO_MATCHLIMIT
+    cdef int PCRE2_INFO_MAXLOOKBEHIND
+    cdef int PCRE2_INFO_MINLENGTH
+    cdef int PCRE2_INFO_NAMECOUNT
+    cdef int PCRE2_INFO_NAMEENTRYSIZE
+    cdef int PCRE2_INFO_NAMETABLE
+    cdef int PCRE2_INFO_NEWLINE
+    cdef int PCRE2_INFO_DEPTHLIMIT
+    cdef int PCRE2_INFO_RECURSIONLIMIT  # Obsolete synonym 
+    cdef int PCRE2_INFO_SIZE
+    cdef int PCRE2_INFO_HASBACKSLASHC
+    cdef int PCRE2_INFO_FRAMESIZE
+    cdef int PCRE2_INFO_HEAPLIMIT
+    cdef int PCRE2_INFO_EXTRAOPTIONS
+
+    # Request types for pcre2_config(). 
+    cdef int PCRE2_CONFIG_BSR
+    cdef int PCRE2_CONFIG_JIT
+    cdef int PCRE2_CONFIG_JITTARGET
+    cdef int PCRE2_CONFIG_LINKSIZE
+    cdef int PCRE2_CONFIG_MATCHLIMIT
+    cdef int PCRE2_CONFIG_NEWLINE
+    cdef int PCRE2_CONFIG_PARENSLIMIT
+    cdef int PCRE2_CONFIG_DEPTHLIMIT
+    cdef int PCRE2_CONFIG_RECURSIONLIMIT  # Obsolete synonym 
+    cdef int PCRE2_CONFIG_STACKRECURSE  # Obsolete 
+    cdef int PCRE2_CONFIG_UNICODE
+    cdef int PCRE2_CONFIG_UNICODE_VERSION
+    cdef int PCRE2_CONFIG_VERSION
+    cdef int PCRE2_CONFIG_HEAPLIMIT
+    cdef int PCRE2_CONFIG_NEVER_BACKSLASH_C
+    cdef int PCRE2_CONFIG_COMPILED_WIDTHS
+    cdef int PCRE2_CONFIG_TABLES_LENGTH
+
+
+    # Opaque handles for PCRE2 defined structs.
+    ctypedef struct pcre2_code_t "pcre2_code":
+        pass
+    ctypedef struct pcre2_match_data_t "pcre2_match_data":
+        pass
+    ctypedef struct pcre2_general_context_t "pcre2_general_context":
+        pass
+    ctypedef struct pcre2_compile_context_t "pcre2_compile_context":
+        pass
+    ctypedef struct pcre2_match_context_t "pcre2_match_context":
+        pass
+
+    # Basic string definition. Note that this assumes PCRE2 is compiled to
+    # support 8-bit strings.
+    ctypedef const uint8_t *pcre2_sptr_t "PCRE2_SPTR"
+
+    
+    # Error handling functions.
+    int pcre2_get_error_message(
+        int errorcode,
+        uint8_t *buffer,
+        size_t bufflen
+    )
+
+    # Pattern compilation functions.
+    pcre2_code_t * pcre2_compile(
+        pcre2_sptr_t pattern, 
+        size_t length,
+        uint32_t options,
+        int *errorcode,
+        size_t *erroroffset,
+        pcre2_compile_context_t *ccontext
+    )
+
+    int pcre2_jit_compile(
+        pcre2_code_t *code,
+        uint32_t options
+    )
+
+
+    void pcre2_code_free(pcre2_code_t *code)
+
+    # Information on compiled pattern.
+    int pcre2_pattern_info(
+        const pcre2_code_t *code,
+        uint32_t what,
+        void *where
+    )
+
+    int pcre2_substring_number_from_name(
+        const pcre2_code_t *code,
+        pcre2_sptr_t name
+    )
+    
+    # Matching and match data functions.
+    pcre2_match_data_t * pcre2_match_data_create(
+        uint32_t ovecsize,
+        pcre2_general_context_t *gcontext
+    )
+    
+    pcre2_match_data_t * pcre2_match_data_create_from_pattern(
+        const pcre2_code_t *code,
+        pcre2_general_context_t *gcontext
+    )
+    
+    int pcre2_match(
+        const pcre2_code_t *code,
+        pcre2_sptr_t subject,
+        size_t length,
+        size_t startoffset,
+        uint32_t options,
+        pcre2_match_data_t *match_data,
+        pcre2_match_context_t *mcontext
+    )
+    int pcre2_jit_match(
+        const pcre2_code_t *code,
+        pcre2_sptr_t subject,
+        size_t length,
+        size_t startoffset,
+        uint32_t options,
+        pcre2_match_data_t *match_data,
+        pcre2_match_context_t *mcontext
+    )
+    
+    void pcre2_match_data_free(pcre2_match_data_t *match_data)
+
+    uint32_t pcre2_get_ovector_count(pcre2_match_data_t *match_data)
+
+    size_t *pcre2_get_ovector_pointer(pcre2_match_data_t *match_data)
+
+    int pcre2_substring_nametable_scan(
+        const pcre2_code_t *code,
+        pcre2_sptr_t name,
+        pcre2_sptr_t *first,
+        pcre2_sptr_t *last
+    )
+
+    # String extraction from match data blocks.
+    int pcre2_substring_length_byname(
+        pcre2_match_data_t *match_data,
+        pcre2_sptr_t name,
+        size_t *bufflen
+    )
+
+    int pcre2_substring_get_byname(
+        pcre2_match_data_t *match_data,
+        pcre2_sptr_t name, 
+        uint8_t **bufferptr,
+        size_t *bufflen
+    )
+
+    int pcre2_substring_length_bynumber(
+        pcre2_match_data_t *match_data,
+        uint32_t number,
+        size_t *bufflen
+    )
+
+    int pcre2_substring_get_bynumber(
+        pcre2_match_data_t *match_data,
+        uint32_t number,
+        uint8_t **bufferptr,
+        size_t *bufflen
+    )
+
+    # Substitution.
+    int pcre2_substitute(
+        const pcre2_code_t *code,
+        pcre2_sptr_t subject,
+        size_t length,
+        size_t startoffset,
+        uint32_t options,
+        pcre2_match_data_t *match_data,
+        pcre2_match_context_t *mcontext,
+        pcre2_sptr_t replacement,
+        size_t rlength,
+        uint8_t *outputbuffer,
+        size_t *outlengthptr
+    )
+
+    # Serialization.
+    int32_t pcre2_serialize_decode(
+        pcre2_code_t **codes,
+        int32_t number_of_codes,
+        const uint8_t *code_bytes,
+        pcre2_general_context_t *gcontex
+    )
+    int32_t pcre2_serialize_encode(
+        pcre2_code_t **codes,
+        int32_t number_of_codes,
+        uint8_t **serialized_bytes,
+        size_t *serialized_size,
+        pcre2_general_context_t *gcontex
+    )
+    void pcre2_serialize_free(uint8_t *bytes)
diff --git a/src/pcre2/match.pxd b/src/pcre2/match.pxd
new file mode 100644 (file)
index 0000000..76a61e2
--- /dev/null
@@ -0,0 +1,22 @@
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from cpython cimport Py_buffer
+from libc.stdint cimport uint32_t
+
+# Local imports.
+from .libpcre2 cimport *
+from .pattern cimport Pattern
+
+
+cdef class Match:
+    # Raw match data block, managed by PCRE2.
+    cdef pcre2_match_data_t *_mtch
+    # Pattern object used in the match.
+    cdef Pattern _pattern
+    # Buffer over the subject the pattern was matched against.
+    cdef Py_buffer *_subj
+    cdef size_t _ofst # Byte offset, regardless of subject type.
+    # Option bits used in the match call.
+    cdef uint32_t _opts
+
+    # Factory that builds a Match from C-level fields (implemented in match.pyx).
+    @staticmethod
+    cdef Match _from_data(
+        pcre2_match_data_t *mtch, Pattern pattern, Py_buffer *subj, size_t ofst, uint32_t opts
+    )
diff --git a/src/pcre2/match.pyx b/src/pcre2/match.pyx
new file mode 100644 (file)
index 0000000..925a534
--- /dev/null
@@ -0,0 +1,259 @@
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from enum import IntEnum
+from libc.stdint cimport uint32_t
+from libc.stdlib cimport malloc, free
+from cpython.unicode cimport PyUnicode_Check
+cimport cython
+
+# Local imports.
+from .utils cimport *
+from .libpcre2 cimport *
+from .pattern cimport Pattern
+
+
+@cython.freelist(8)
+cdef class Match:
+    """
+    Object wrapper for a match block in PCRE2. Contains all relevant
+    information of a successful match. Attributes defined in match.pxd, see
+    below for an overview:
+        _mtch: Raw match data block, managed by PCRE2
+        _pattern: Pattern object used in match
+        _subj: Subject the pattern was matched against
+        _ofst: Byte offset (regardless of subject type) used in match
+        _opts: Option bits used in match call
+    """
+
+    # =================================== #
+    #         Lifetime management         #
+    # =================================== #
+
+    def __cinit__(self):
+        # Begin in an empty state; _from_data() assigns the real values.
+        self._opts = 0
+        self._pattern = None
+        self._subj = NULL
+        self._mtch = NULL
+
+
+    def __init__(self, *args, **kwargs):
+        # Instances are only built through _from_data(), because raw pointers
+        # cannot be handed to a Python-level constructor. Always refuse.
+        cls = type(self)
+        module = cls.__module__
+        qualname = cls.__qualname__
+        raise TypeError(f"Cannot create '{module}.{qualname}' instances")
+
+
+    def __dealloc__(self):
+        # Release the subject buffer, then the PCRE2 match data block. Either
+        # pointer is still NULL if the object was never populated.
+        if self._subj != NULL:
+            free_buffer(self._subj)
+        if self._mtch != NULL:
+            pcre2_match_data_free(self._mtch)
+
+
+    @staticmethod
+    cdef Match _from_data(
+            pcre2_match_data_t *mtch,
+            Pattern pattern,
+            Py_buffer *subj,
+            size_t ofst,
+            uint32_t opts):
+        """ Build a Match from already-created C-level fields. The new object
+        takes over the given pointers and frees them in __dealloc__, so the
+        caller must not free them afterwards.
+        """
+        # Match.__new__() skips __init__(), which always raises.
+        cdef Match obj = Match.__new__(Match)
+        obj._opts = opts
+        obj._ofst = ofst # Code unit offset
+        obj._subj = subj
+        obj._pattern = pattern
+        obj._mtch = mtch
+        return obj
+
+
+    # ========================== #
+    #         Properties         #
+    # ========================== #
+
+    @property
+    def options(self):
+        """ Option bits stored on this match (the `_opts` field).
+        """
+        return self._opts
+
+    
+    @property
+    def subject(self):
+        """ Python object exposed by the stored subject buffer.
+        """
+        return self._subj.obj
+
+    
+    @property
+    def pattern(self):
+        """ Pattern object this match was produced with.
+        """
+        return self._pattern
+
+
+    # ======================= #
+    #         Methods         #
+    # ======================= #
+
+    def start(self, group=0):
+        """ Get the starting index of the matched substring, or of a specified
+        captured group.
+
+        `group` is either a group number or a group name. For string subjects
+        the index is a code point index, otherwise it is a byte index. Returns
+        -1 when the group exists but did not take part in the match. Raises
+        ValueError when the group number is out of range; a name that is not
+        in the pattern is reported through raise_from_rc().
+        """
+        ovec_count = pcre2_get_ovector_count(self._mtch)
+        ovec_table = pcre2_get_ovector_pointer(self._mtch)
+
+        cdef int grp_num
+        cdef int scan_rc
+        cdef pcre2_sptr_t first_entry
+        cdef pcre2_sptr_t last_entry
+        if isinstance(group, int):
+            grp_num = group
+        else:
+            grp_name = get_buffer(group)
+            try:
+                scan_rc = pcre2_substring_nametable_scan(
+                    self._pattern._code, <pcre2_sptr_t>grp_name.buf, &first_entry, &last_entry
+                )
+            finally:
+                # Release the name buffer on every path, including errors.
+                free_buffer(grp_name)
+
+            # A negative return code means the name was not found; in that
+            # case first_entry was never written and must not be read.
+            if scan_rc < 0:
+                raise_from_rc(scan_rc, None)
+            # The first two bytes of a name table entry hold the group number.
+            grp_num = (first_entry[0] << 8) | first_entry[1]
+
+        # The ovector holds ovec_count pairs, so valid groups are
+        # 0 .. ovec_count - 1.
+        if grp_num < 0 or grp_num >= <int>ovec_count:
+            raise ValueError("Group referenced out of bounds")
+        start = ovec_table[2 * grp_num]
+
+        # PCRE2 marks unset groups with an all-ones offset (PCRE2_UNSET).
+        if start == <size_t>-1:
+            return -1
+
+        # Convert to code point index as necessary.
+        if PyUnicode_Check(self._subj.obj):
+            _, start = codeunit_to_codepoint(self._subj, start, 0, 0)
+
+        return start
+
+
+    def end(self, group=0):
+        """ Get the ending index of the matched substring, or of a specified
+        captured group.
+
+        `group` is either a group number or a group name. For string subjects
+        the index is a code point index, otherwise it is a byte index. Returns
+        -1 when the group exists but did not take part in the match. Raises
+        ValueError when the group number is out of range; a name that is not
+        in the pattern is reported through raise_from_rc().
+        """
+        ovec_count = pcre2_get_ovector_count(self._mtch)
+        ovec_table = pcre2_get_ovector_pointer(self._mtch)
+
+        cdef int grp_num
+        cdef int scan_rc
+        cdef pcre2_sptr_t first_entry
+        cdef pcre2_sptr_t last_entry
+        if isinstance(group, int):
+            grp_num = group
+        else:
+            grp_name = get_buffer(group)
+            try:
+                scan_rc = pcre2_substring_nametable_scan(
+                    self._pattern._code, <pcre2_sptr_t>grp_name.buf, &first_entry, &last_entry
+                )
+            finally:
+                # Release the name buffer on every path, including errors.
+                free_buffer(grp_name)
+
+            # A negative return code means the name was not found; in that
+            # case first_entry was never written and must not be read.
+            if scan_rc < 0:
+                raise_from_rc(scan_rc, None)
+            # The first two bytes of a name table entry hold the group number.
+            grp_num = (first_entry[0] << 8) | first_entry[1]
+
+        # The ovector holds ovec_count pairs, so valid groups are
+        # 0 .. ovec_count - 1.
+        if grp_num < 0 or grp_num >= <int>ovec_count:
+            raise ValueError("Group referenced out of bounds.")
+        end = ovec_table[2 * grp_num + 1]
+
+        # PCRE2 marks unset groups with an all-ones offset (PCRE2_UNSET).
+        if end == <size_t>-1:
+            return -1
+
+        # Convert to code point index as necessary.
+        if PyUnicode_Check(self._subj.obj):
+            _, end = codeunit_to_codepoint(self._subj, end, 0, 0)
+
+        return end
+
+
+    def substring(self, group=0, default=""):
+        """ Get the full matched substring, or that of a specified captured
+        group.
+
+        `group` is either a group number or a group name. `default` is
+        returned when PCRE2 reports that the length of the group cannot be
+        obtained (for example because the group is unset). The result is a
+        string for string subjects and a bytes object otherwise.
+        """
+        cdef uint8_t *res
+        cdef size_t res_len
+        if isinstance(group, int):
+            grp_num = <uint32_t>group
+
+            # Handle unset matches and return default if none found
+            is_substring_set = pcre2_substring_length_bynumber(self._mtch, grp_num, NULL)
+            if is_substring_set < 0:
+                return default
+
+            get_rc = pcre2_substring_get_bynumber(self._mtch, grp_num, &res, &res_len)
+        else:
+            grp_name = get_buffer(group)
+            try:
+                # Handle unset matches and return default if none found
+                is_substring_set = pcre2_substring_length_byname(
+                    self._mtch, <pcre2_sptr_t>grp_name.buf, NULL
+                )
+                if is_substring_set < 0:
+                    return default
+
+                get_rc = pcre2_substring_get_byname(
+                    self._mtch, <pcre2_sptr_t>grp_name.buf, &res, &res_len
+                )
+            finally:
+                # Release the name buffer on every path: early return, error
+                # and success alike.
+                free_buffer(grp_name)
+
+        if get_rc < 0:
+            raise_from_rc(get_rc, None)
+
+        # res_len is the exact substring length reported by PCRE2, so the
+        # slice needs no trimming. Stripping NUL bytes here would corrupt
+        # matches that legitimately begin or end with b"\x00".
+        # NOTE(review): `res` is allocated by PCRE2 and should be released
+        # with pcre2_substring_free(); that function is not among the
+        # declarations visible to this module, so the buffer is still leaked.
+        result = (<pcre2_sptr_t>res)[:res_len]
+        if PyUnicode_Check(self._subj.obj):
+            result = result.decode("utf-8")
+
+        return result
+
+
+    def __getitem__(self, group):
+        """ Alias to substring: `match[group]` returns `match.substring(group)`
+        with the default value of `default`.
+        """
+        return self.substring(group)
+
+
+    def expand(self, replacement, offset=0, options=0, low_memory=False):
+        """ Equivalent to calling substitute with the provided match. The type
+        of the subject determines the type of the returned string.
+
+        Raises ValueError when a string subject is combined with a bytes-like
+        replacement or vice versa. A failed substitution is reported through
+        raise_from_rc(). Unless `low_memory` is set, the initial result
+        buffer is sized at one and a half times the subject length.
+        """
+        is_subj_utf = <bint>PyUnicode_Check(self._subj.obj)
+        is_repl_utf = <bint>PyUnicode_Check(replacement)
+        if is_subj_utf ^ is_repl_utf:
+            subj_type = "string" if is_subj_utf else "bytes-like"
+            repl_type = "string" if is_repl_utf else "bytes-like"
+            raise ValueError(f"Cannot use a {subj_type} subject with a {repl_type} replacement")
+
+        # Convert the scalar arguments before acquiring the replacement
+        # buffer, so a bad `offset` or `options` value cannot leak it.
+        cdef size_t obj_ofst = <size_t>offset
+        cdef size_t ofst = obj_ofst
+        cdef uint32_t opts = <uint32_t>options | PCRE2_SUBSTITUTE_MATCHED
+        cdef size_t res_buf_len = 0
+        cdef int rc = 0
+        if is_subj_utf:
+            ofst, obj_ofst = codepoint_to_codeunit(self._subj, obj_ofst, 0, 0)
+        if not low_memory:
+            res_buf_len = self._subj.len + (self._subj.len // 2)
+
+        # Convert Python objects to C strings.
+        repl = get_buffer(replacement)
+        try:
+            res, res_len = Pattern._substitute(
+                self._pattern._code, repl, self._subj, res_buf_len, ofst, opts, self._mtch, &rc
+            )
+        finally:
+            # The replacement buffer is only needed during the call; release
+            # it whether or not the substitution succeeded.
+            free_buffer(repl)
+        if res is NULL:
+            raise_from_rc(rc, None)
+
+        # Copy the result into a Python object, then free the C buffer even
+        # if the copy fails.
+        try:
+            result = (<pcre2_sptr_t>res)[:res_len]
+        finally:
+            free(res)
+
+        # NOTE(review): the NUL strip is kept because the exact meaning of the
+        # length returned by Pattern._substitute is not visible here; it also
+        # trims genuine NUL bytes at either end of the result - confirm.
+        result = result.strip(b"\x00")
+        if is_subj_utf:
+            result = result.decode("utf-8")
+
+        return result
+
+    def groups(self, default=""):
+        """ Return a tuple containing all the subgroups of the match, from 1 up to however many
+        groups are in the pattern. Groups for which substring() finds no value are reported
+        as `default`.
+        """
+        # Group 0 is the whole match, so subgroups run from 1 through
+        # capture_count inclusive.
+        last_group = self.pattern.capture_count
+        return tuple(self.substring(g, default=default) for g in range(1, last_group + 1))
diff --git a/src/pcre2/methods.pxd b/src/pcre2/methods.pxd
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/src/pcre2/methods.pyx b/src/pcre2/methods.pyx
new file mode 100644 (file)
index 0000000..2dab35e
--- /dev/null
@@ -0,0 +1,94 @@
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from libc.stdint cimport uint32_t
+from cpython cimport Py_buffer
+from cpython.unicode cimport PyUnicode_Check
+
+# Local imports.
+from .utils cimport *
+from .libpcre2 cimport *
+from .pattern cimport Pattern
+from .match cimport Match
+
+
+def compile(pattern, options=0, jit=False):
+    """ Factory function to compile regular expressions into Pattern objects.
+    See the following PCRE2 documentation for a brief overview of the relevant
+    options:
+        http://pcre.org/current/doc/html/pcre2_compile.html
+
+    String patterns are always compiled with PCRE2_UTF. When `jit` is true
+    the compiled pattern is JIT compiled as well. A compilation failure is
+    reported through raise_from_rc() together with the failing position
+    (a code point position for string patterns).
+    """
+    # Convert the options first so that a bad value cannot leak the buffer.
+    cdef uint32_t opts = <uint32_t>options
+    cdef bint is_patn_utf = PyUnicode_Check(pattern)
+    cdef int compile_rc
+    cdef size_t compile_errpos
+    cdef Py_buffer *patn = get_buffer(pattern)
+
+    # Ensure unicode strings are processed with UTF-8 support.
+    if is_patn_utf:
+        opts = opts | PCRE2_UTF
+
+    cdef pcre2_code_t *code = pcre2_compile(
+        <pcre2_sptr_t>patn.buf, <size_t>patn.len, opts, &compile_rc, &compile_errpos, NULL
+    )
+
+    if code is NULL:
+        try:
+            # If source was a unicode string, use the code point offset.
+            if is_patn_utf:
+                _, compile_errpos = codeunit_to_codepoint(patn, compile_errpos, 0, 0)
+            additional_msg = f"Compilation failed at position {compile_errpos!r}"
+            # raise_from_rc() is expected to raise for a failed compilation.
+            raise_from_rc(compile_rc, additional_msg)
+        finally:
+            # No Pattern will take ownership of the buffer on this path.
+            free_buffer(patn)
+
+    pattern_obj = Pattern._from_data(code, patn, opts)
+    if jit:
+        pattern_obj.jit_compile()
+    return pattern_obj
+
+
+def findall(pattern, subject, offset=0, options=0, jit=True):
+    """ Compile `pattern` and return the result of calling findall on
+    `subject`. The pattern is JIT compiled unless `jit` is false.
+    """
+    pattern_obj = compile(pattern, options=options, jit=jit)
+    return pattern_obj.findall(subject, offset=offset)
+
+
+def match(pattern, subject, offset=0, options=0, jit=False):
+    """ Compile `pattern` and return the result of calling match on
+    `subject`. The pattern is JIT compiled only when `jit` is true.
+    """
+    pattern_obj = compile(pattern, options=options, jit=jit)
+    return pattern_obj.match(subject, offset=offset)
+
+
+def scan(pattern, subject, offset=0, options=0, jit=True):
+    """ Compile `pattern` and return the result of calling scan on
+    `subject`. The pattern is JIT compiled unless `jit` is false.
+    """
+    pattern_obj = compile(pattern, options=options, jit=jit)
+    return pattern_obj.scan(subject, offset=offset)
+
+
+def split(pattern, subject, maxsplit=0, offset=0, options=0, jit=True):
+    """ Compile `pattern` and return the result of calling split on
+    `subject`. The pattern is JIT compiled unless `jit` is false.
+    """
+    return compile(pattern, options=options, jit=jit).split(
+        subject, maxsplit=maxsplit, offset=offset
+    )
+
+
+def substitute(
+    pattern,
+    replacement,
+    subject,
+    offset=0,
+    suball=True,
+    literal=False,
+    low_memory=False,
+    options=0,
+    jit=True
+):
+    """ Shorthand for compiling a pattern, then calling substitute. The
+    pattern is JIT compiled unless `jit` is false.
+    """
+    # compile() already honours `jit`. JIT compilation used to be forced
+    # again whenever `suball` was set, which silently ignored an explicit
+    # jit=False and compiled twice with jit=True.
+    pattern_obj = compile(pattern, options=options, jit=jit)
+    return pattern_obj.substitute(
+        replacement, subject, offset=offset, suball=suball, literal=literal, low_memory=low_memory
+    )
diff --git a/src/pcre2/pattern.pxd b/src/pcre2/pattern.pxd
new file mode 100644 (file)
index 0000000..258e138
--- /dev/null
@@ -0,0 +1,38 @@
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from cpython cimport Py_buffer
+from libc.stdint cimport uint32_t
+
+# Local imports.
+from .libpcre2 cimport *
+
+
+cdef class Pattern:
+    # Raw compiled pattern (PCRE2 code struct), managed by PCRE2.
+    cdef pcre2_code_t *_code
+    # Buffer over the Python object that was passed to compile.
+    cdef Py_buffer *_patn
+    # Option bits passed to the compile call.
+    cdef uint32_t _opts
+    # Indicator of whether the pattern was JIT compiled.
+    cdef bint _jitc
+
+    # Factory that builds a Pattern from C-level fields (implemented in
+    # pattern.pyx).
+    @staticmethod
+    cdef Pattern _from_data(
+        pcre2_code_t *code, Py_buffer *patn, uint32_t opts
+    )
+
+    # Typed wrappers around pcre2_pattern_info(); `except *` lets a Python
+    # exception raised inside them propagate to the caller.
+    @staticmethod
+    cdef uint32_t _info_uint(pcre2_code_t *code, uint32_t what) except *
+    @staticmethod
+    cdef size_t _info_size(pcre2_code_t *code, uint32_t what) except *
+    @staticmethod
+    cdef bint _info_bint(pcre2_code_t *code, uint32_t what) except *
+
+    # Runs a match and returns the match data block; the PCRE2 return code is
+    # written through `rc`.
+    @staticmethod
+    cdef pcre2_match_data_t * _match(
+        pcre2_code_t *code, Py_buffer *subj, size_t ofst, uint32_t opts, int *rc
+    )
+
+    # Runs a substitution and returns a (buffer, length) pair; the return code
+    # is written through `rc`.
+    @staticmethod
+    cdef (uint8_t *, size_t) _substitute(
+        pcre2_code_t *code, Py_buffer *repl, Py_buffer *subj, size_t res_buf_len,
+        size_t ofst, uint32_t opts, pcre2_match_data_t *mtch, int *rc
+    )
diff --git a/src/pcre2/pattern.pyx b/src/pcre2/pattern.pyx
new file mode 100644 (file)
index 0000000..66ce24b
--- /dev/null
@@ -0,0 +1,485 @@
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from libc.stdint cimport uint32_t
+from libc.stdlib cimport malloc, free
+from cpython cimport Py_buffer
+from cpython cimport array
+from cpython.unicode cimport PyUnicode_Check
+from cpython.memoryview cimport PyMemoryView_FromMemory
+
+# Local imports.
+from .utils cimport *
+from .libpcre2 cimport *
+from .match cimport Match
+from .scanner cimport Scanner
+from .consts import BsrChar, NewlineChar
+
+
+def _rebuild(pattern, code_bytes_obj, options):
+    """ Deserializes code object to allow for unpickling.
+
+    `pattern` is the original pattern object, `code_bytes_obj` the bytes
+    produced by Pattern.__reduce__ and `options` the stored option bits. A
+    decoding failure is reported through raise_from_rc().
+
+    NOTE(review): the serialized bytes are handed to pcre2_serialize_decode()
+    as they are, so only data from a trusted source should be unpickled.
+    """
+    # Convert the options first so that a bad value cannot leak a buffer.
+    cdef uint32_t opts = <uint32_t>options
+    cdef pcre2_code_t *code = NULL
+    cdef Py_buffer *patn = get_buffer(pattern)
+    cdef Py_buffer *code_buf
+    try:
+        code_buf = get_buffer(code_bytes_obj)
+        try:
+            number_of_codes = pcre2_serialize_decode(
+                &code, 1, <const uint8_t *>code_buf.buf, NULL
+            )
+        finally:
+            # The serialized bytes are only needed while decoding.
+            free_buffer(code_buf)
+        if number_of_codes < 0:
+            raise_from_rc(number_of_codes, None)
+    except:
+        # No Pattern takes ownership of the pattern buffer on failure.
+        free_buffer(patn)
+        raise
+
+    return Pattern._from_data(code, patn, opts)
+
+
+cdef class Pattern:
+    """
+    Object wrapper for a compiled pattern (known as a code struct) in PCRE2.
+    Attributes defined in pattern.pxd, see below for an overview:
+        _code: Raw compiled pattern, managed by PCRE2
+        _patn: Python object passed to compile
+        _opts: Option bits passed to compile call
+        _jitc: Indicator if pattern was JIT compiled
+    """
+
+    # =================================== #
+    #         Lifetime management         #
+    # =================================== #
+
+    def __cinit__(self):
+        # Begin in an empty state; _from_data() assigns the real values.
+        self._jitc = False
+        self._opts = 0
+        self._patn = NULL
+        self._code = NULL
+
+
+    def __init__(self, *args, **kwargs):
+        # Instances are only built through _from_data(), because raw pointers
+        # cannot be handed to a Python-level constructor. Always refuse.
+        cls = type(self)
+        module = cls.__module__
+        qualname = cls.__qualname__
+        raise TypeError(f"Cannot create '{module}.{qualname}' instances")
+
+
+    def __dealloc__(self):
+        # Release the pattern buffer, then the compiled PCRE2 code. Either
+        # pointer is still NULL if the object was never populated.
+        if self._patn != NULL:
+            free_buffer(self._patn)
+        if self._code != NULL:
+            pcre2_code_free(self._code)
+
+
+    @staticmethod
+    cdef Pattern _from_data(pcre2_code_t *code, Py_buffer *patn, uint32_t opts):
+        """ Build a Pattern from already-created C-level fields. The new
+        object takes over the given pointers and frees them in __dealloc__,
+        so the caller must not free them afterwards.
+        """
+        # Pattern.__new__() skips __init__(), which always raises.
+        cdef Pattern obj = Pattern.__new__(Pattern)
+        obj._opts = opts
+        obj._patn = patn
+        obj._code = code
+        return obj
+
+
+    # ========================================= #
+    #         Serialize and deserialize         #
+    # ========================================= #
+
+    def __reduce__(self):
+        """ Serializes code object to allow for pickling.
+
+        Returns the `(callable, args)` pair used by pickle: `_rebuild` along
+        with the original pattern object, the serialized code bytes and the
+        stored option bits. A serialization failure is reported through
+        raise_from_rc().
+        """
+        cdef uint8_t *code_bytes = NULL
+        cdef size_t code_count = 0
+        serialize_rc = pcre2_serialize_encode(
+            <const pcre2_code_t **>&self._code, 1, &code_bytes, &code_count, NULL
+        )
+        if serialize_rc < 0:
+            raise_from_rc(serialize_rc, None)
+
+        # Copy the serialized data into a Python bytes object, then hand the
+        # PCRE2-allocated buffer back so it is not leaked.
+        try:
+            serialized = code_bytes[:code_count]
+        finally:
+            pcre2_serialize_free(code_bytes)
+
+        return (_rebuild, (self._patn.obj, serialized, self._opts))
+
+
+    # =================================== #
+    #         Pattern information         #
+    # =================================== #
+
+    @staticmethod
+    cdef uint32_t _info_uint(pcre2_code_t *code, uint32_t what) except *:
+        """ Query pcre2_pattern_info() for a uint32_t value, passing any
+        negative return code to raise_from_rc().
+        """
+        cdef uint32_t value
+        rc = pcre2_pattern_info(code, what, &value)
+        if rc < 0:
+            raise_from_rc(rc, None)
+        return value
+
+    @staticmethod
+    cdef size_t _info_size(pcre2_code_t *code, uint32_t what) except *:
+        """ Query pcre2_pattern_info() for a size_t value, passing any
+        negative return code to raise_from_rc().
+        """
+        cdef size_t value
+        rc = pcre2_pattern_info(code, what, &value)
+        if rc < 0:
+            raise_from_rc(rc, None)
+        return value
+
+    @staticmethod
+    cdef bint _info_bint(pcre2_code_t *code, uint32_t what) except *:
+        """ Query pcre2_pattern_info() for a boolean value, passing any
+        negative return code to raise_from_rc().
+        """
+        cdef bint flag
+        rc = pcre2_pattern_info(code, what, &flag)
+        if rc < 0:
+            raise_from_rc(rc, None)
+        return flag
+
+
+    @property
+    def pattern(self):
+        """ The Python object exposed by the stored pattern buffer, i.e. the
+        pattern this object was compiled with.
+        """
+        source = self._patn.obj
+        return source
+
+
+    @property
+    def options(self):
+        """ The compile options as reported by PCRE2_INFO_ALLOPTIONS, that
+        is, as modified by any top-level (*XXX) option settings such as
+        (*UTF) at the start of the pattern itself.
+        """
+        all_options = Pattern._info_uint(self._code, PCRE2_INFO_ALLOPTIONS)
+        return all_options
+
+
+    @property
+    def backslash_r(self):
+        """ A BsrChar value indicating which character sequences the \\R
+        escape sequence matches.
+        """
+        return BsrChar(Pattern._info_uint(self._code, PCRE2_INFO_BSR))
+
+
+    @property
+    def capture_count(self):
+        """ The highest capture group number in the pattern. When `(?|` is
+        not used in the pattern, this equals the total number of capture
+        groups.
+        """
+        highest_group = Pattern._info_uint(self._code, PCRE2_INFO_CAPTURECOUNT)
+        return highest_group
+
+
+    @property
+    def jit_size(self):
+        """ Size of the JIT compiled code if the pattern was successfully JIT
+        compiled, zero otherwise.
+        """
+        jit_bytes = Pattern._info_size(self._code, PCRE2_INFO_JITSIZE)
+        return jit_bytes
+
+    @property
+    def min_length(self):
+        """ Minimum number of characters a subject string must have to match.
+        """
+        shortest = Pattern._info_uint(self._code, PCRE2_INFO_MINLENGTH)
+        return shortest
+
+    
+    @property
+    def name_count(self):
+        """ Number of named capture groups in the pattern.
+        """
+        named_groups = Pattern._info_uint(self._code, PCRE2_INFO_NAMECOUNT)
+        return named_groups
+
+
+    @property
+    def newline(self):
+        """ A NewlineChar value for the type of character sequence that is
+        recognized as a newline while matching.
+        """
+        return NewlineChar(Pattern._info_uint(self._code, PCRE2_INFO_NEWLINE))
+
+
+    @property
+    def size(self):
+        """ Size of the compiled pattern in bytes.
+        """
+        code_bytes = Pattern._info_size(self._code, PCRE2_INFO_SIZE)
+        return code_bytes
+
+
+    def name_dict(self):
+        """ Build a dictionary that maps each named capture group's number to
+        its name, read from the PCRE2 name table. Names are strings when the
+        pattern object is a string and bytes otherwise.
+        """
+        cdef pcre2_sptr_t name_table
+        cdef uint32_t i
+
+        # Fetch the number of entries, the entry width and the table itself.
+        count = Pattern._info_uint(self._code, PCRE2_INFO_NAMECOUNT)
+        entry_size = Pattern._info_uint(self._code, PCRE2_INFO_NAMEENTRYSIZE)
+        rc = pcre2_pattern_info(self._code, PCRE2_INFO_NAMETABLE, &name_table)
+        if rc < 0:
+            raise_from_rc(rc, None)
+
+        # The pattern type does not change while the table is walked.
+        as_text = PyUnicode_Check(self._patn.obj)
+
+        names = {}
+        for i in range(count):
+            base = i * entry_size
+
+            # Each entry begins with a two byte group number, followed by the
+            # name padded with NUL bytes up to the entry size.
+            number = int((name_table[base] << 8) | name_table[base + 1])
+            raw_name = name_table[base + 2:base + entry_size].strip(b"\x00")
+
+            names[number] = raw_name.decode("utf-8") if as_text else raw_name
+
+        return names
+
+
+    # ======================= #
+    #         Methods         #
+    # ======================= #
+
+    def jit_compile(self):
+        """ Run PCRE2's JIT compiler on the pattern with PCRE2_JIT_COMPLETE
+        and record success in `_jitc`. A negative return code is passed to
+        raise_from_rc().
+        """
+        rc = pcre2_jit_compile(self._code, PCRE2_JIT_COMPLETE)
+        if rc < 0:
+            raise_from_rc(rc, None)
+        self._jitc = True
+
+    
+    @staticmethod
+    cdef pcre2_match_data_t * _match(
+            pcre2_code_t *code,
+            Py_buffer *subj,
+            size_t ofst,
+            uint32_t opts,
+            int *rc):
+        """ Allocate a match data block for `code` and run pcre2_match() on
+        the subject buffer. The PCRE2 return code is written to `rc[0]`. NULL
+        is returned, with PCRE2_ERROR_NOMEMORY in `rc[0]`, only when the
+        allocation fails; otherwise the block is returned even if the match
+        itself failed.
+        """
+        cdef pcre2_match_data_t *match_data = pcre2_match_data_create_from_pattern(code, NULL)
+        if match_data is not NULL:
+            # Attempt match of pattern onto subject.
+            rc[0] = pcre2_match(
+                code, <pcre2_sptr_t>subj.buf, <size_t>subj.len,
+                ofst, opts, match_data, NULL
+            )
+        else:
+            rc[0] = PCRE2_ERROR_NOMEMORY
+        return match_data
+
+
+    def findall(self, subject, offset=0):
+        """
+        Return all non-overlapping matches of our pattern in subject, as a list of strings or tuples.
+
+        The string is scanned left-to-right, and matches are returned in the
+        order found. Empty matches are included in the result.
+
+        The result depends on the number of capturing groups in the pattern.
+        If there are no groups, return a list of strings matching the whole
+        pattern. If there is exactly one group, return a list of strings
+        matching that group. If multiple groups are present, return a list of
+        tuples of strings matching the groups. Non-capturing groups do not
+        affect the form of the result.
+        """
+        matches = self.scan(subject, offset=offset)
+
+        # Read the group count once; every property access queries PCRE2.
+        group_count = self.capture_count
+        if group_count == 0:
+            return [m.substring() for m in matches]
+        if group_count == 1:
+            return [m.substring(1) for m in matches]
+
+        # Group 0 is the whole match, so the capturing groups run from 1
+        # through group_count inclusive.
+        return [
+            tuple(m.substring(g) for g in range(1, group_count + 1))
+            for m in matches
+        ]
+
+
+    def match(self, subject, offset=0):
+        """ If match exists, returns the corresponding Match object. Otherwise
+        a MatchError is thrown in the case of no matches. See the following
+        PCRE2 documentation for a brief overview of the relevant options:
+            http://pcre.org/current/doc/html/pcre2_match.html
+
+        Raises ValueError when a string pattern is combined with a bytes-like
+        subject or vice versa. For string subjects `offset` is a code point
+        index.
+        """
+        cdef bint is_patn_utf = PyUnicode_Check(self._patn.obj)
+        cdef bint is_subj_utf = PyUnicode_Check(subject)
+        if is_patn_utf ^ is_subj_utf:
+            patn_type = "string" if is_patn_utf else "bytes-like"
+            subj_type = "string" if is_subj_utf else "bytes-like"
+            raise ValueError(f"Cannot use a {patn_type} pattern with a {subj_type} subject")
+
+        # Convert the offset before acquiring the subject buffer, so a bad
+        # value cannot leak it.
+        cdef size_t obj_ofst = <size_t>offset
+        cdef size_t ofst = obj_ofst
+        cdef uint32_t opts = 0
+        cdef int match_rc = 0
+        cdef pcre2_match_data_t *mtch = NULL
+        cdef Py_buffer *subj = get_buffer(subject)
+
+        try:
+            # Convert indices accordingly.
+            if is_subj_utf:
+                ofst, obj_ofst = codepoint_to_codeunit(subj, obj_ofst, 0, 0)
+
+            mtch = Pattern._match(self._code, subj, ofst, opts, &match_rc)
+            if match_rc < 0:
+                raise_from_rc(match_rc, None)
+        except:
+            # No Match object takes ownership on failure, so release the
+            # match data (allocated even when no match is found) and the
+            # subject buffer here.
+            if mtch is not NULL:
+                pcre2_match_data_free(mtch)
+            free_buffer(subj)
+            raise
+
+        return Match._from_data(mtch, self, subj, ofst, opts)
+
+
+    def scan(self, subject, offset=0):
+        """ Returns iterator over all non-overlapping matches in a subject,
+        yielding Match objects. Scanning starts at offset.
+
+        Raises ValueError when a string pattern is combined with a bytes-like
+        subject or vice versa.
+        """
+        cdef bint is_patn_utf = PyUnicode_Check(self._patn.obj)
+        cdef bint is_subj_utf = PyUnicode_Check(subject)
+        if is_patn_utf ^ is_subj_utf:
+            patn_type = "string" if is_patn_utf else "bytes-like"
+            subj_type = "string" if is_subj_utf else "bytes-like"
+            raise ValueError(f"Cannot use a {patn_type} pattern with a {subj_type} subject")
+
+        # Convert the offset before acquiring the buffer: a value that does
+        # not fit in size_t (e.g. a negative one) then raises without leaving
+        # an unreleased buffer behind.
+        cdef size_t start_ofst = <size_t>offset
+
+        subj = get_buffer(subject)
+        return Scanner._from_data(self, subj, start_ofst)
+
+
+    def split(self, subject, maxsplit=0, offset=0):
+        """
+        Split subject by occurrences of our pattern.
+
+        If capturing parentheses are used in the pattern, then the text of
+        all groups in the pattern is also returned as part of the resulting
+        list. If maxsplit is nonzero, at most maxsplit splits occur, and the
+        remainder of the string is returned as the final element of the list.
+
+        If there are capturing groups in the separator and it matches at the
+        start of the string, the result will start with an empty string. The
+        same holds for the end of the string. That way, separator components
+        are always found at the same relative indices within the result list.
+
+        Matches whose start equals their end (empty matches) never split the
+        subject; they are skipped.
+        """
+        pieces = []
+        cursor = 0
+        splits_done = 0
+        for found in self.scan(subject, offset=offset):
+            begin = found.start()
+            finish = found.end()
+            if begin == finish:
+                # Empty match: nothing to cut out.
+                continue
+
+            pieces.append(subject[cursor:begin])
+            pieces.extend(found.groups())
+            cursor = finish
+            splits_done += 1
+            if maxsplit > 0 and splits_done >= maxsplit:
+                break
+
+        # Whatever follows the last separator is the final element.
+        pieces.append(subject[cursor:])
+        return pieces
+
+
+    @staticmethod
+    cdef (uint8_t *, size_t) _substitute(
+            pcre2_code_t *code,
+            Py_buffer *repl,
+            Py_buffer *subj,
+            size_t res_buf_len,
+            size_t ofst,
+            uint32_t opts,
+            pcre2_match_data_t *mtch,
+            int *rc):
+        """ Safe wrapper around calling PCRE2 function directly.
+
+        A first substitution is attempted into a buffer of res_buf_len code
+        units. If PCRE2 reports that the buffer is too small, the required
+        length it computed is used to allocate a new buffer and the
+        substitution is attempted a second time.
+
+        On success, returns the malloc'ed result buffer and the length
+        reported by PCRE2; the caller is responsible for freeing the buffer.
+        On failure, the subj and repl buffers are released, the error code is
+        stored in rc[0] and (NULL, 0) is returned. An allocation failure is
+        reported as PCRE2_ERROR_NOMEMORY.
+        """
+        cdef size_t res_len = res_buf_len
+        cdef uint8_t *res
+        res = <uint8_t *>malloc(res_len * sizeof(uint8_t))
+        # malloc(0) may legitimately return NULL, so only a non-empty request
+        # counts as an allocation failure.
+        if res is NULL and res_len > 0:
+            free_buffer(subj)
+            free_buffer(repl)
+            rc[0] = PCRE2_ERROR_NOMEMORY
+            return NULL, 0
+
+        substitute_rc = pcre2_substitute(
+            code,
+            <pcre2_sptr_t>subj.buf, <size_t>subj.len,
+            ofst, opts | PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, mtch, NULL,
+            <pcre2_sptr_t>repl.buf, <size_t>repl.len,
+            res, &res_len
+        )
+        # Reattempt substitution, now with required size of buffer known.
+        if substitute_rc == PCRE2_ERROR_NOMEMORY:
+            free(res)
+            res = <uint8_t *>malloc(res_len * sizeof(uint8_t))
+            if res is NULL and res_len > 0:
+                free_buffer(subj)
+                free_buffer(repl)
+                rc[0] = PCRE2_ERROR_NOMEMORY
+                return NULL, 0
+
+            substitute_rc = pcre2_substitute(
+                code,
+                <pcre2_sptr_t>subj.buf, <size_t>subj.len,
+                ofst, opts, mtch, NULL,
+                <pcre2_sptr_t>repl.buf, <size_t>repl.len,
+                res, &res_len
+            )
+        # Capture return codes from both substitute attempts.
+        if substitute_rc < 0:
+            free(res)
+            free_buffer(subj)
+            free_buffer(repl)
+            rc[0] = substitute_rc
+            return NULL, 0
+
+        return res, res_len
+
+
+    def substitute(
+        self, replacement, subject, offset=0, suball=True, literal=False, low_memory=False
+    ):
+        """ Returns the string obtained by replacing matches in subject with a
+        replacement. Note that option bits can significantly change the
+        function's behavior. See the following PCRE2 documentation for a brief
+        overview of the relevant options:
+            http://pcre.org/current/doc/html/pcre2_substitute.html
+
+        Arguments:
+            replacement: Replacement text, of the same kind (string or
+                bytes-like) as the subject and the pattern.
+            subject: Text to perform the substitution on.
+            offset: Offset in the subject at which matching starts.
+            suball: Replace every match (PCRE2_SUBSTITUTE_GLOBAL) rather than
+                only the first one.
+            literal: Insert the replacement verbatim
+                (PCRE2_SUBSTITUTE_LITERAL).
+            low_memory: Start with an empty output buffer instead of one
+                sized at 1.5 times the subject.
+
+        Raises ValueError when pattern, subject and replacement are not all
+        strings or all bytes-like; PCRE2 failures are raised through
+        raise_from_rc.
+        """
+        is_patn_utf = <bint>PyUnicode_Check(self._patn.obj)
+        is_subj_utf = <bint>PyUnicode_Check(subject)
+        is_repl_utf = <bint>PyUnicode_Check(replacement)
+        if is_subj_utf ^ is_repl_utf:
+            subj_type = "string" if is_subj_utf else "bytes-like"
+            repl_type = "string" if is_repl_utf else "bytes-like"
+            raise ValueError(f"Cannot use a {subj_type} subject with a {repl_type} replacement")
+        if is_patn_utf ^ is_subj_utf:
+            patn_type = "string" if is_patn_utf else "bytes-like"
+            subj_type = "string" if is_subj_utf else "bytes-like"
+            raise ValueError(f"Cannot use a {patn_type} pattern with a {subj_type} subject")
+
+        # Convert Python objects to C types. The offset goes first so that an
+        # invalid value raises before any buffer has been acquired.
+        cdef size_t obj_ofst = <size_t>offset
+        cdef size_t ofst = obj_ofst
+        cdef Py_buffer *subj = get_buffer(subject)
+        cdef Py_buffer *repl = NULL
+        try:
+            repl = get_buffer(replacement)
+        except:
+            # Do not leak the subject buffer when the replacement is unusable.
+            free_buffer(subj)
+            raise
+
+        # Fill options from flags
+        cdef uint32_t opts = 0
+        if suball:
+            opts = opts | PCRE2_SUBSTITUTE_GLOBAL
+        if literal:
+            opts = opts | PCRE2_SUBSTITUTE_LITERAL
+
+        # Always replace unmatched groups with an empty string to match behavior of re
+        opts = opts | PCRE2_SUBSTITUTE_UNSET_EMPTY
+
+        if is_subj_utf:
+            ofst, obj_ofst = codepoint_to_codeunit(subj, obj_ofst, 0, 0)
+
+        cdef size_t res_buf_len = 0
+        if not low_memory:
+            res_buf_len = subj.len + (subj.len // 2)
+
+        cdef int rc = 0
+        res, res_len = Pattern._substitute(
+            self._code, repl, subj, res_buf_len, ofst, opts, NULL, &rc
+        )
+        if res is NULL:
+            # _substitute has already released subj and repl on failure.
+            raise_from_rc(rc, None)
+
+        # Copy exactly the res_len code units PCRE2 reported. That length
+        # already excludes the terminating zero, so nothing is stripped:
+        # stripping would also remove NULs that belong to the result.
+        try:
+            result = (<pcre2_sptr_t>res)[:res_len]
+        finally:
+            # Release all C resources before decoding, which may raise.
+            free(res)
+            free_buffer(subj)
+            free_buffer(repl)
+
+        # Convert to unicode as appropriate.
+        if is_subj_utf:
+            result = result.decode("utf-8")
+        return result
diff --git a/src/pcre2/scanner.pxd b/src/pcre2/scanner.pxd
new file mode 100644 (file)
index 0000000..475228f
--- /dev/null
@@ -0,0 +1,26 @@
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from cpython cimport Py_buffer
+from libc.stdint cimport uint32_t
+
+# Local imports.
+from .libpcre2 cimport *
+from .pattern cimport Pattern
+
+
+# Declaration of the Scanner extension type; the implementation and the
+# behaviour of each member live in scanner.pyx.
+cdef class Scanner:
+    # Pattern object to use for matching.
+    cdef Pattern _pattern
+    # Subject to scan.
+    cdef Py_buffer *_subj
+
+    # Whether the character sequence CRLF denotes a newline.
+    cdef bint _is_crlf_newline
+    # Whether the pattern was compiled with UTF support.
+    cdef bint _is_patn_utf
+
+    # Options to pass to match.
+    cdef uint32_t _state_opts
+    # Byte offset to match at.
+    cdef size_t _state_ofst
+    # Object offset to match at.
+    cdef size_t _state_obj_ofst
+
+    # Factory used to build Scanner objects from C-type fields.
+    @staticmethod
+    cdef Scanner _from_data(
+        Pattern pattern, Py_buffer *subject, size_t offset
+    )
diff --git a/src/pcre2/scanner.pyx b/src/pcre2/scanner.pyx
new file mode 100644 (file)
index 0000000..b6da2b1
--- /dev/null
@@ -0,0 +1,173 @@
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from libc.stdint cimport uint32_t
+from libc.stdlib cimport malloc, free
+from cpython cimport Py_buffer
+from cpython cimport array
+from cpython.unicode cimport PyUnicode_Check
+from cpython.memoryview cimport PyMemoryView_FromMemory
+
+# Local imports.
+from .utils cimport *
+from .libpcre2 cimport *
+from .match cimport Match
+from .pattern cimport Pattern
+from .consts import BsrChar, NewlineChar
+
+
+cdef class Scanner:
+    """ Iterator object that scans a subject for all non-overlapping matches
+    of a pattern. Attributes defined in scanner.pxd, see below for an overview:
+        _pattern: Pattern object to use for matching
+        _subj: Subject to scan
+        _is_crlf_newline: Whether the character sequence CRLF denotes a newline
+        _is_patn_utf: Whether the pattern was compiled with UTF support
+        _state_opts: Options to pass to match
+        _state_ofst: Byte offset to match at
+        _state_obj_ofst: Object offset to match at
+    """
+
+
+    # =================================== #
+    #         Lifetime management         #
+    # =================================== #
+
+    def __cinit__(self):
+        # Start from an empty state; _from_data fills in the real values.
+        self._pattern = None
+        self._subj = NULL
+
+        self._is_patn_utf = False
+        self._is_crlf_newline = False
+
+        self._state_opts = 0
+        self._state_ofst = 0
+        self._state_obj_ofst = 0
+
+
+    def __init__(self, *args, **kwargs):
+        # Prevent accidental instantiation from normal Python code since we
+        # cannot pass pointers into a Python constructor.
+        module = self.__class__.__module__
+        qualname = self.__class__.__qualname__
+        raise TypeError(f"Cannot create '{module}.{qualname}' instances")
+
+
+    def __dealloc__(self):
+        # The scanner owns its subject buffer, release it with the object.
+        if self._subj is not NULL:
+            free_buffer(self._subj)
+
+
+    @staticmethod
+    cdef Scanner _from_data(Pattern pattern, Py_buffer *subj, size_t offset):
+        """ Factory function to create Scanner objects from C-type fields. The
+        ownership of the given pointers is stolen, which causes the extension
+        type to free them when the object is deallocated.
+        """
+        # Fast call to __new__() that bypasses the __init__() constructor.
+        cdef Scanner scanner = Scanner.__new__(Scanner)
+        scanner._pattern = pattern
+        scanner._subj = subj
+
+        patn_opts = Pattern._info_uint(pattern._code, PCRE2_INFO_ALLOPTIONS)
+        scanner._is_patn_utf = (patn_opts & PCRE2_UTF) != 0
+        newline = Pattern._info_uint(pattern._code, PCRE2_INFO_NEWLINE)
+        scanner._is_crlf_newline = (
+            newline == PCRE2_NEWLINE_ANY or
+            newline == PCRE2_NEWLINE_CRLF or
+            newline == PCRE2_NEWLINE_ANYCRLF
+        )
+        scanner._state_opts = 0
+
+        # Compute and set byte equivalent offset.
+        if scanner._is_patn_utf:
+            ofst, obj_ofst = codepoint_to_codeunit(scanner._subj, offset, 0, 0)
+            scanner._state_ofst = ofst
+            scanner._state_obj_ofst = obj_ofst
+        else:
+            scanner._state_obj_ofst = offset
+            scanner._state_ofst = scanner._state_obj_ofst
+        return scanner
+
+
+    # ======================================== #
+    #         Iteration implementation         #
+    # ======================================== #
+
+    def __iter__(self):
+        return self
+
+
+    def __next__(self):
+        """ Yields next match object found in subject.
+
+        Raises StopIteration once no further match can be found, and the
+        exception selected by raise_from_rc for any other PCRE2 error.
+        """
+        if self._state_obj_ofst > self._subj.len:
+            raise StopIteration
+
+        # Attempt match of pattern onto subject.
+        match_rc = <int>0
+        mtch = Pattern._match(
+            self._pattern._code, self._subj, self._state_ofst, self._state_opts, &match_rc
+        )
+
+        # Handle no matches in result.
+        if match_rc == PCRE2_ERROR_NOMATCH:
+            # Default match is not anchored so if no match found at current offset, then there
+            # will not be any ahead either.
+            if self._state_opts == 0:
+                pcre2_match_data_free(mtch)
+                raise StopIteration
+
+            # Reset options so empty strings can match at next offset.
+            self._state_opts = 0
+
+            # Increment to next character and handle possible CRLF newlines.
+            # The two bytes are compared directly on the raw buffer: the
+            # buffer pointer is plain memory, not a Python bytes object, so it
+            # must not be cast to one. The bounds test guarantees that both
+            # bytes lie inside the subject.
+            obj_ofst_increment = 1
+            if self._is_crlf_newline and (self._state_ofst + 1) < self._subj.len:
+                if (
+                    (<unsigned char *>self._subj.buf)[self._state_ofst] == 0x0D and
+                    (<unsigned char *>self._subj.buf)[self._state_ofst + 1] == 0x0A
+                ):
+                    obj_ofst_increment += 1
+
+            # Convert indices accordingly.
+            if self._is_patn_utf:
+                self._state_ofst, self._state_obj_ofst = codepoint_to_codeunit(
+                    self._subj,
+                    self._state_obj_ofst + obj_ofst_increment,
+                    self._state_ofst,
+                    self._state_obj_ofst
+                )
+            else:
+                self._state_obj_ofst = self._state_obj_ofst + obj_ofst_increment
+                self._state_ofst = self._state_obj_ofst
+
+            # Retry from the advanced offset with the default options.
+            pcre2_match_data_free(mtch)
+            return self.__next__()
+
+        # Handle all other errors.
+        elif mtch is NULL or match_rc < 0:
+            pcre2_match_data_free(mtch)
+            raise_from_rc(match_rc, None)
+
+        # If the match was successful.
+        else:
+            ovec_table = pcre2_get_ovector_pointer(mtch)
+            mtch_end = ovec_table[1]
+
+            if self._state_ofst == mtch_end:
+                # If the matched string is empty ensure next is not.
+                self._state_opts = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED
+            else:
+                # Convert the end in the byte string to the end in the object.
+                self._state_opts = 0
+                if self._is_patn_utf:
+                    self._state_ofst, self._state_obj_ofst = codeunit_to_codepoint(
+                        self._subj, mtch_end, self._state_ofst, self._state_obj_ofst
+                    )
+                else:
+                    self._state_obj_ofst = mtch_end
+                    self._state_ofst = self._state_obj_ofst
+
+            # Create new buffer for match object to own
+            subj_copy = get_buffer(self._subj.obj)
+            return Match._from_data(
+                mtch, self._pattern, subj_copy, self._state_ofst, self._state_opts
+            )
diff --git a/src/pcre2/utils.pxd b/src/pcre2/utils.pxd
new file mode 100755 (executable)
index 0000000..1d4d898
--- /dev/null
@@ -0,0 +1,22 @@
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from cpython cimport Py_buffer
+
+
+# Releases the buffer's reference and frees the struct; NULL is accepted.
+cdef int free_buffer(Py_buffer *pybuf)
+
+# Allocates a Py_buffer for a string (encoded via UTF-8) or a bytes-like
+# object. NULL is the error return, i.e. a Python exception is pending.
+cdef Py_buffer * get_buffer(object obj) except NULL
+
+# Index conversions between code units (bytes of the UTF-8 buffer) and code
+# points. Both start from the given current (code unit, code point) pair and
+# return the updated (code unit index, code point index) pair.
+cdef (size_t, size_t) codeunit_to_codepoint(
+    Py_buffer *pybuf,
+    size_t codeunit_idx,
+    size_t cur_codeunit_idx, size_t cur_codepoint_idx
+)
+cdef (size_t, size_t) codepoint_to_codeunit(
+    Py_buffer *pybuf,
+    size_t codepoint_idx,
+    size_t cur_codeunit_idx, size_t cur_codepoint_idx
+)
+
+# Raises the exception class that corresponds to a PCRE2 error code.
+cdef void * raise_from_rc(int errorcode, object context_msg) except NULL
diff --git a/src/pcre2/utils.pyx b/src/pcre2/utils.pyx
new file mode 100755 (executable)
index 0000000..1d7dde3
--- /dev/null
@@ -0,0 +1,115 @@
+# -*- coding:utf-8 -*-
+
+# Standard libraries.
+from libc.stdlib cimport malloc, free
+from libc.stdint cimport uint8_t
+from cpython cimport Py_buffer
+from cpython.buffer cimport (
+    PyObject_CheckBuffer,
+    PyBuffer_IsContiguous,
+    PyObject_GetBuffer,
+    PyBuffer_FillInfo,
+    PyBuffer_Release
+)
+from cpython.unicode cimport (
+    PyUnicode_Check
+)
+cdef extern from "Python.h":
+    int PyUnicode_1BYTE_KIND
+    int PyUnicode_2BYTE_KIND
+    int PyUnicode_4BYTE_KIND
+    unsigned int PyUnicode_KIND(object o)
+    void *PyUnicode_DATA(object o)
+    const char * PyUnicode_AsUTF8AndSize(object unicode, Py_ssize_t *size)
+
+# Local imports.
+from .libpcre2 cimport *
+from .exceptions import LibraryError, CompileError, MatchError
+
+
+cdef int free_buffer(Py_buffer *pybuf):
+    """ Release the reference held by a buffer and free the buffer struct
+    itself. A NULL pointer is accepted and ignored. Always returns 0.
+    """
+    if pybuf is NULL:
+        return 0
+
+    PyBuffer_Release(pybuf)
+    free(pybuf)
+    return 0
+
+
+cdef Py_buffer * get_buffer(object obj) except NULL:
+    """ Get a Python buffer from an object, encoding via UTF-8 if unicode
+    based
+
+    The returned struct is allocated with malloc and has to be released with
+    free_buffer. Raises MemoryError if the struct cannot be allocated and
+    ValueError if the object is neither a string nor a contiguous bytes-like
+    object; errors raised while encoding the string or acquiring the buffer
+    are propagated unchanged.
+    """
+    cdef const char *sptr = NULL
+    cdef Py_ssize_t length = 0
+
+    pybuf = <Py_buffer *>malloc(sizeof(Py_buffer))
+    if not pybuf:
+        raise MemoryError()
+
+    # Process unicode and derivative objects.
+    if PyUnicode_Check(obj):
+        sptr = PyUnicode_AsUTF8AndSize(obj, &length)
+        if sptr is NULL:
+            # The string could not be encoded and CPython has already set
+            # the exception; returning the error value propagates it as is.
+            free(pybuf)
+            return NULL
+        fill_buf_rc = PyBuffer_FillInfo(pybuf, obj, <void *>sptr, length, 1, 0)
+        if fill_buf_rc < 0:
+            # The struct was never filled in, so only the allocation itself
+            # has to be undone.
+            free(pybuf)
+            raise ValueError("Could not fill internal buffer")
+
+    # Handle all other bytes-like objects.
+    else:
+        if not PyObject_CheckBuffer(obj):
+            free(pybuf)
+            raise ValueError("Input must be string or bytes-like")
+
+        try:
+            PyObject_GetBuffer(obj, pybuf, 0)
+        except:
+            # Nothing was acquired, but the struct must not leak.
+            free(pybuf)
+            raise
+        if not PyBuffer_IsContiguous(pybuf, b"A"):
+            free_buffer(pybuf)
+            raise ValueError("Bytes-like object must be contiguous")
+
+    return pybuf
+
+
+cdef (size_t, size_t) codeunit_to_codepoint(
+    Py_buffer *pybuf,
+    size_t codeunit_idx,
+    size_t cur_codeunit_idx, size_t cur_codepoint_idx
+):
+    """ Convert a code unit index to a code point index
+
+    Walks the buffer from cur_codeunit_idx up to (excluding) codeunit_idx and
+    adds one to cur_codepoint_idx for every byte that is not of the form
+    10xxxxxx. Returns the resulting (code unit index, code point index) pair.
+    """
+    cdef uint8_t *units = <uint8_t *>pybuf.buf
+    cdef uint8_t unit
+    while cur_codeunit_idx < codeunit_idx:
+        unit = units[cur_codeunit_idx]
+        cur_codeunit_idx += 1
+        # Bytes of the form 10xxxxxx continue a code point already counted.
+        if (unit & 0xC0) == 0x80:
+            continue
+        cur_codepoint_idx += 1
+    return cur_codeunit_idx, cur_codepoint_idx
+
+    
+cdef (size_t, size_t) codepoint_to_codeunit(
+    Py_buffer *pybuf,
+    size_t codepoint_idx,
+    size_t cur_codeunit_idx, size_t cur_codepoint_idx
+):
+    """ Convert a code point index to a code unit index
+
+    Advances from the given (cur_codeunit_idx, cur_codepoint_idx) pair until
+    cur_codepoint_idx reaches codepoint_idx, skipping bytes of the form
+    10xxxxxx. Returns the resulting (code unit index, code point index) pair.
+
+    The buffer is never read at or beyond pybuf.len: positions from the end
+    of the buffer onwards are counted as one code point per code unit, so a
+    code point index past the end yields a code unit index past the end
+    instead of reading unrelated memory.
+    """
+    cdef size_t buf_len = <size_t>pybuf.len
+    while cur_codepoint_idx < codepoint_idx:
+        cur_codeunit_idx += 1
+        if cur_codeunit_idx >= buf_len:
+            # This step counts as one code point; the remaining ones each
+            # take exactly one further code unit.
+            cur_codeunit_idx += codepoint_idx - cur_codepoint_idx - 1
+            cur_codepoint_idx = codepoint_idx
+            break
+        if (((<uint8_t *>pybuf.buf)[cur_codeunit_idx]) & 0xC0) != 0x80:
+            cur_codepoint_idx += 1
+    return cur_codeunit_idx, cur_codepoint_idx
+
+
+cdef void * raise_from_rc(int errorcode, object context_msg) except NULL:
+    """ Raise the exception class that corresponds to a PCRE2 error code
+
+    Positive codes raise CompileError, the no-match and partial-match codes
+    raise MatchError and every other code raises LibraryError. The exception
+    is constructed from the error code and the given context message. This
+    function never returns normally.
+    """
+    if errorcode > 0:
+        raise CompileError(errorcode, context_msg)
+
+    if errorcode in (PCRE2_ERROR_NOMATCH, PCRE2_ERROR_PARTIAL):
+        raise MatchError(errorcode, context_msg)
+
+    raise LibraryError(errorcode, context_msg)
diff --git a/tests/test_groups.py b/tests/test_groups.py
new file mode 100644 (file)
index 0000000..2fcfd2e
--- /dev/null
@@ -0,0 +1,14 @@
+import pytest
+import pcre2
+from pcre2.exceptions import CompileError, MatchError, LibraryError
+
+def test_match_groups():
+    assert pcre2.match('a', 'a').groups() == ()
+    assert pcre2.match('(a)', 'a').groups() == ('a',)
+
+    assert pcre2.match(b'a', b'a').groups() == ()
+    assert pcre2.match(b'(a)', b'a').groups() == (b'a',)
+
+    for a in ("\xe0", "\u0430", "\U0001d49c"):
+        assert pcre2.match(a, a).groups() == ()
+        assert pcre2.match('(%s)' % a, a).groups() == (a,)
diff --git a/tests/test_match.py b/tests/test_match.py
new file mode 100644 (file)
index 0000000..8db098b
--- /dev/null
@@ -0,0 +1,40 @@
+import pytest
+import pcre2
+from pcre2.exceptions import CompileError, MatchError, LibraryError
+
+
+# All tests should match successfully.
+test_data_match_bounds = [
+    (b".*", "aba•ba••ba•••b".encode(), 0, 0, 0, 0, 26),
+    (".*", "aba•ba••ba•••b", 0, 0, 0, 0, 14),
+]
+@pytest.mark.parametrize("pattern,subject,options,offset,group,start,end", test_data_match_bounds)
+def test_match_bounds(pattern, subject, options, offset, group, start, end):
+    p = pcre2.compile(pattern, options=options)
+    m = p.match(subject, offset=offset)
+    assert (m.start(group), m.end(group)) == (start, end)
+
+
+test_data_match_substring = [
+    (b".*", "aba•ba••ba•••b".encode(), 0, 0, "aba•ba••ba•••b".encode()),
+    (".*", "aba•ba••ba•••b", 0, 0, "aba•ba••ba•••b"),
+]
+@pytest.mark.parametrize("pattern,subject,options,offset,substring", test_data_match_substring)
+def test_match_substring(pattern, subject, options, offset, substring):
+    p = pcre2.compile(pattern, options=options)
+    m = p.match(subject, offset=offset)
+    assert m.substring() == substring
+
+
+test_data_match_expand = [
+    (b"[abc]*", b"", b"dabacbaccbacccb", 0, 0, b"dabacbaccbacccb"),
+    ("[abc]*", "", "dabacbaccbacccb", 0, 0, "dabacbaccbacccb"),
+    ("[abc]*", "", "dabacbaccbacccb", 0, 1, "d"),
+]
+@pytest.mark.parametrize(
+    "pattern,replacement,subject,options,offset,result", test_data_match_expand
+)
+def test_match_expand(pattern, replacement, subject, options, offset, result):
+    p = pcre2.compile(pattern, options=options)
+    m = p.match(subject, offset=offset)
+    assert m.expand(replacement) == result
\ No newline at end of file
diff --git a/tests/test_pattern.py b/tests/test_pattern.py
new file mode 100644 (file)
index 0000000..d953228
--- /dev/null
@@ -0,0 +1,231 @@
+import pytest
+import pcre2
+from pcre2.exceptions import CompileError, MatchError, LibraryError
+from pcre2.consts import CompileOption
+
+
+test_data_pattern_compile_success = [
+    (b"a+b+c*d*", 0, "SUCCESS"),
+    (b"(?<foo>a+b+)c*d*", 0, "SUCCESS"),
+    (b"(?<foo>a+b+))c*d*", 0, "COMPILE_ERROR"),
+    ("å+∫+ç*∂*".encode(), 0, "SUCCESS"),
+    ("a+b+c*d*", 0, "SUCCESS"),
+    ("(?<foo>a+b+)c*d*", 0, "SUCCESS"),
+    ("(?<foo>a+b+))c*d*", 0, "COMPILE_ERROR"),
+    ("(?<<foo>a+b+)c*d*", 0, "COMPILE_ERROR"),
+    ("(?<foo>a+b+)c*d*(?<foo>a+b+)", 0, "COMPILE_ERROR"),
+    ("(?<foo>a+b+)c*d*(?<foo>a+b+)", pcre2.CompileOption.DUPNAMES, "SUCCESS"),
+    ("å+∫+ç*∂*", 0, "SUCCESS"),
+    ("(?<ƒøø>a+b+)c*d*", 0, "SUCCESS"),
+]
+@pytest.mark.parametrize("pattern,options,return_code", test_data_pattern_compile_success)
+def test_pattern_compile_success(pattern, options, return_code):
+    try:
+        p = pcre2.compile(pattern, options=options)
+        rc = "SUCCESS"
+        assert p.jit_size == 0
+    except CompileError as e:
+        rc = "COMPILE_ERROR"
+    except LibraryError as e:
+        rc = "LIB_ERROR"
+    assert rc == return_code
+
+@pytest.mark.parametrize("pattern,options,return_code", test_data_pattern_compile_success)
+def test_pattern_jit_compile_success(pattern, options, return_code):
+    try:
+        p = pcre2.compile(pattern, options=options, jit=True)
+        rc = "SUCCESS"
+        assert p.jit_size > 0
+    except CompileError as e:
+        rc = "COMPILE_ERROR"
+    except LibraryError as e:
+        rc = "LIB_ERROR"
+    assert rc == return_code
+
+
+test_data_pattern_name_dict = [
+    (b"(?<foo>a+b+)c*d*", 0, {1: b"foo"}),
+    ("(?<foo>a+b+)c*d*", 0, {1: "foo"}),
+    ("(?<ƒøø>a+b+)c*d*", 0, {1: "ƒøø"}),
+    ("(?<foo>a+b+)c*d*(?<bar>a+b+)", 0, {1: "foo", 2: "bar"}),
+    ("(?<foo>a+b+)c*(.+)d*(?<bar>a+b+)", 0, {1: "foo", 3: "bar"}),
+    ("(?<foo>a+b+)c*d*(?<foo>a+b+)", pcre2.CompileOption.DUPNAMES, {1: "foo", 2: "foo"}),
+]
+@pytest.mark.parametrize("pattern,options,name_dict", test_data_pattern_name_dict)
+def test_pattern_name_dict(pattern, options, name_dict):
+    p = pcre2.compile(pattern, options=options)
+    assert p.name_dict() == name_dict
+
+
+test_data_pattern_match_success = [
+    (b".*", b"abacbaccbacccb", 0, 0, "SUCCESS"),
+    (".*", "abacbaccbacccb", 0, 0, "SUCCESS"),
+    ("ac{3,}b", "abacbaccbacccb", 0, 0, "SUCCESS"),
+    ("a•{3,}b", "aba•ba••ba•••b", 0, 0, "SUCCESS"),
+    ("ab", "abacbaccbacccb", 0, 2, "MATCH_ERROR"),
+    ("((((((((((((((()))))))))))))))", "", 0, 0, "SUCCESS"),
+]
+@pytest.mark.parametrize(
+    "pattern,subject,options,offset,return_code", test_data_pattern_match_success
+)
+def test_pattern_match_success(pattern, subject, options, offset, return_code):
+    p = pcre2.compile(pattern, options=options)
+    try:
+        m = p.match(subject, offset=offset)
+        rc = "SUCCESS"
+    except MatchError as e:
+        rc = "MATCH_ERROR"
+    except LibraryError as e:
+        rc = "LIB_ERROR"
+    assert rc == return_code
+
+
+test_data_pattern_scan_length = [
+    (b".+", b"abacbaccbacccb", 0, 1),
+    (b".*", b"abacbaccbacccb", 0, 2),
+    (".+", "abacbaccbacccb", 0, 1),
+    (".*", "abacbaccbacccb", 0, 2),
+    ("[abc]*", "dabacbaccbacccb", 0, 3),
+    ("ac{2,}b", "abacbaccbacccb", 0, 2),
+    ("a•{2,}b", "aba•ba••ba•••b", 0, 2),
+    ("a•*b", "aba•ba••ba•••b", 0, 4),
+    ("ab", "abacbaccbacccb", 2, 0),
+]
+@pytest.mark.parametrize(
+    "pattern,subject,offset,iter_length", test_data_pattern_scan_length
+)
+def test_pattern_scan_length(pattern, subject, offset, iter_length):
+    p = pcre2.compile(pattern)
+    s = p.scan(subject, offset=offset)
+    assert len(list(iter(s))) == iter_length
+
+
+test_pattern_substitute = [
+    (b"[abc]*", b"", b"dabacbaccbacccb", False, False, 0, b"dabacbaccbacccb"),
+    ("[abc]*", "", "dabacbaccbacccb", False, False, 0, "dabacbaccbacccb"),
+    ("[abc]*", "", "dabacbaccbacccb", False, False, 1, "d"),
+    ("a(•{2,})b", "a•b", "aba•ba••ba•••b", True, False, 0, "aba•ba•ba•b"),
+    ("a(•{2,})b", "a$1b", "aba•ba••ba•••b", True, True, 0, "aba•ba$1ba$1b"),
+]
+@pytest.mark.parametrize(
+    "pattern,replacement,subject,suball,literal,offset,result", test_pattern_substitute
+)
+def test_pattern_substitute(pattern, replacement, subject, suball, literal, offset, result):
+    p = pcre2.compile(pattern)
+    assert p.substitute(replacement, subject, suball=suball, literal=literal, offset=offset) == result
+
+def test_pattern_findall():
+    """Checks Pattern.findall on compiled patterns: a two-group pattern,
+    character classes containing doubled punctuation, and quantified
+    patterns compiled with pcre2.S."""
+    # NOTE(review): with two groups the expected tuples hold the whole match
+    # and group 1, not groups 1 and 2 as re.findall would return; this is
+    # what Pattern.findall currently produces - confirm it is intended.
+    p = pcre2.compile(r'(\w+)=(\d+)')
+    assert p.findall('set width=20 and height=10') == [('width=20', 'width'), ('height=10', 'height')]
+    # Subject holding every character from code point 0 to 127, in order.
+    s = bytes(range(128)).decode()
+    p2 = pcre2.compile(r'[0-9--1]')
+    assert p2.findall(s) == list('-./0123456789')
+    p3 = pcre2.compile(r'[%--1]')
+    assert p3.findall(s) == list("%&'()*+,-1")
+    p4 = pcre2.compile(r'[%--]')
+    assert p4.findall(s) == list("%&'()*+,-")
+    p5 = pcre2.compile(r'[0-9&&1]')
+    assert p5.findall(s) == list('&0123456789')
+    p6 = pcre2.compile(r'[\d&&1]')
+    assert p6.findall(s) == list('&0123456789')
+    p7 = pcre2.compile(r'[0-9||a]')
+    assert p7.findall(s) == list('0123456789a|')
+    p8 = pcre2.compile(r'[\d||a]')
+    assert p8.findall(s) == list('0123456789a|')
+    p9 = pcre2.compile(r'[0-9~~1]')
+    assert p9.findall(s) == list('0123456789~')
+    p10 = pcre2.compile(r'[\d~~1]')
+    assert p10.findall(s) == list('0123456789~')
+    p11 = pcre2.compile(r'[[0-9]|]')
+    assert p11.findall(s) == list('0123456789[]')
+
+    # Every quantifier, with and without a trailing '?', must still find the
+    # whole of 'xyz', for string and bytes patterns alike.
+    for reps in '*', '+', '?', '{1}':
+        for mod in '', '?':
+            pattern = '.' + reps + mod + 'yz'
+            assert pcre2.compile(pattern, pcre2.S).findall('xyz') == ['xyz'], pattern
+            pattern = pattern.encode()
+            assert pcre2.compile(pattern, pcre2.S).findall(b'xyz') == [b'xyz'], pattern
+
+
+def test_pattern_jit_findall():
+    """Checks the module-level pcre2.findall function on a range of
+    patterns: groups, non-ASCII text, word boundaries, character classes,
+    possessive quantifiers and atomic groups."""
+    # NOTE(review): as in test_pattern_findall, the two-group expectation
+    # holds the whole match and group 1 - confirm it is intended.
+    assert pcre2.findall(r'(\w+)=(\d+)', 'set width=20 and height=10') == [('width=20', 'width'), ('height=10', 'height')]
+    assert pcre2.findall(":+", "abc") == []
+    assert pcre2.findall(":+", "a:b::c:::d") == [":", "::", ":::"]
+    assert pcre2.findall("(:+)", "a:b::c:::d") == [":", "::", ":::"]
+
+    # Same checks with separators of increasing code point value.
+    for x in ("\xe0", "\u0430", "\U0001d49c"):
+        xx = x * 2
+        xxx = x * 3
+        string = "a%sb%sc%sd" % (x, xx, xxx)
+        assert pcre2.findall("%s+" % x, string) == [x, xx, xxx]
+        assert pcre2.findall("(%s+)" % x, string) == [x, xx, xxx]
+
+    # Zero-width assertions: only the number of matches is checked.
+    assert len(pcre2.findall(r"\b", "a")) == 2
+    assert len(pcre2.findall(r"\B", "a")) == 0
+    assert len(pcre2.findall(r"\b", " ")) == 0
+    assert len(pcre2.findall(r"\b", "   ")) == 0
+    assert len(pcre2.findall(r"\B", " ")) == 2
+
+    # Subject holding every character from code point 0 to 127, in order.
+    s = bytes(range(128)).decode()
+    assert pcre2.findall(r'[--1]', s) ==  list('-./01')
+    assert pcre2.findall(r'[&&1]', s) ==  list('&1')
+    assert pcre2.findall(r'[||1]', s) ==  list('1|')
+    assert pcre2.findall(r'[~~1]', s) ==  list('1~')
+
+    assert pcre2.findall(r"(?i)(a)\1", "aa \u0100") == ['a']
+
+    # Possessive quantifiers on a single character.
+    assert pcre2.findall(r'a++', 'aab') == ['aa']
+    assert pcre2.findall(r'a*+', 'aab') == ['aa', '', '']
+    assert pcre2.findall(r'a?+', 'aab') == ['a', 'a', '', '']
+    assert pcre2.findall(r'a{1,3}+', 'aab') == ['aa']
+
+    # Possessive quantifiers on a non-capturing group.
+    assert pcre2.findall(r'(?:ab)++', 'ababc') == ['abab']
+    assert pcre2.findall(r'(?:ab)*+', 'ababc') == ['abab', '', '']
+    assert pcre2.findall(r'(?:ab)?+', 'ababc') == ['ab', 'ab', '', '']
+    assert pcre2.findall(r'(?:ab){1,3}+', 'ababc') == ['abab']
+
+    # Atomic groups around a single character.
+    assert pcre2.findall(r'(?>a+)', 'aab') == ['aa']
+    assert pcre2.findall(r'(?>a*)', 'aab') == ['aa', '', '']
+    assert pcre2.findall(r'(?>a?)', 'aab') == ['a', 'a', '', '']
+    assert pcre2.findall(r'(?>a{1,3})', 'aab') == ['aa']
+
+    # Atomic groups around a non-capturing group.
+    assert pcre2.findall(r'(?>(?:ab)+)', 'ababc') == ['abab']
+    assert pcre2.findall(r'(?>(?:ab)*)', 'ababc') == ['abab', '', '']
+    assert pcre2.findall(r'(?>(?:ab)?)', 'ababc') == ['ab', 'ab', '', '']
+    assert pcre2.findall(r'(?>(?:ab){1,3})', 'ababc') == ['abab']
+
+    # Bytes pattern built with re.escape from a UTF-8 encoded character.
+    import re
+    b = 'y\u2620y\u2620y'.encode('utf-8')
+    assert len(pcre2.findall(re.escape('\u2620'.encode('utf-8')), b)) == 2
+
+
+def test_pattern_split():
+    pattern = "[\u002E\u3002\uFF0E\uFF61]"
+    assert pcre2.compile(pattern).split("a.b.c") == ['a','b','c']
+
+
+def test_pattern_jit_split():
+    """Checks the module-level pcre2.split function: plain and grouped
+    separators, bytes and non-ASCII subjects, and the maxsplit argument."""
+    # String subjects.
+    assert pcre2.split(":", ":a:b::c") == ['', 'a', 'b', '', 'c']
+    assert pcre2.split(":+", ":a:b::c") == ['', 'a', 'b', 'c']
+    assert pcre2.split("(:+)", ":a:b::c") == ['', ':', 'a', ':', 'b', '::', 'c']
+
+    # Bytes subjects.
+    assert pcre2.split(b":", b":a:b::c") == [b'', b'a', b'b', b'', b'c']
+    assert pcre2.split(b":+", b":a:b::c") == [b'', b'a', b'b', b'c']
+    assert pcre2.split(b"(:+)", b":a:b::c") == [b'', b':', b'a', b':', b'b', b'::', b'c']
+
+    # Fields made of characters of increasing code point value.
+    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
+                    "\U0001d49c\U0001d49e\U0001d4b5"):
+        string = ":%s:%s::%s" % (a, b, c)
+        assert pcre2.split(":", string) == ['', a, b, '', c]
+        assert pcre2.split(":+", string) == ['', a, b, c]
+        assert pcre2.split("(:+)", string) == ['', ':', a, ':', b, '::', c]
+
+    # Non-capturing groups, character classes and alternation.
+    assert pcre2.split("(?::+)", ":a:b::c") == ['', 'a', 'b', 'c']
+    assert pcre2.split("([b:]+)", ":a:b::c") == ['', ':', 'a', ':b::', 'c']
+    assert pcre2.split("(?:b)|(?::+)", ":a:b::c") == ['', 'a', '', '', 'c']
+
+    # maxsplit, given positionally and by keyword.
+    assert pcre2.split(":", ":a:b::c", 2) == ['', 'a', 'b::c']
+    assert pcre2.split(":", ":a:b::c", maxsplit=2) == ['', 'a', 'b::c']
+    assert pcre2.split(':', 'a:b:c:d', maxsplit=2) == ['a', 'b', 'c:d']
+    assert pcre2.split("(:)", ":a:b::c", maxsplit=2) == ['', ':', 'a', ':', 'b::c']
+    assert pcre2.split("(:+)", ":a:b::c", maxsplit=2) == ['', ':', 'a', ':', 'b::c']